Adding upstream version 4.19.249.upstream/4.19.249 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
commit: 76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree: f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/powerpc/kernel
parent: Initial commit. (diff)
download: linux-c109f8d9e922037b3fa45f46d78384d49db8ad76.tar.xz
linux-c109f8d9e922037b3fa45f46d78384d49db8ad76.zip
177 files changed, 85499 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore
new file mode 100644
index 000000000..c5f676c3c
--- /dev/null
+++ b/arch/powerpc/kernel/.gitignore
@@ -0,0 +1 @@
+vmlinux.lds
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
new file mode 100644
index 000000000..bf19c5514
--- /dev/null
+++ b/arch/powerpc/kernel/Makefile
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux kernel.
+#
+
+CFLAGS_ptrace.o		+= -DUTS_MACHINE='"$(UTS_MACHINE)"'
+
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ifdef CONFIG_PPC64
+CFLAGS_prom_init.o	+= $(NO_MINIMAL_TOC)
+endif
+ifdef CONFIG_PPC32
+CFLAGS_prom_init.o      += -fPIC
+CFLAGS_btext.o		+= -fPIC
+endif
+
+CFLAGS_setup_32.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not trace early boot code
+CFLAGS_REMOVE_cputable.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_prom_init.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE)
+endif
+
+obj-y				:= cputable.o ptrace.o syscalls.o \
+				   irq.o align.o signal_32.o pmc.o vdso.o \
+				   process.o systbl.o idle.o \
+				   signal.o sysfs.o cacheinfo.o time.o \
+				   prom.o traps.o setup-common.o \
+				   udbg.o misc.o io.o dma.o misc_$(BITS).o \
+				   of_platform.o prom_parse.o
+obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
+				   signal_64.o ptrace32.o \
+				   paca.o nvram_64.o firmware.o
+obj-$(CONFIG_VDSO32)		+= vdso32/
+obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
+obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o
+obj-$(CONFIG_PPC64)		+= vdso64/
+obj-$(CONFIG_ALTIVEC)		+= vecemu.o
+obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
+obj-$(CONFIG_PPC_P7_NAP)	+= idle_book3s.o
+procfs-y			:= proc_powerpc.o
+obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
+rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
+obj-$(CONFIG_PPC_RTAS)		+= rtas.o rtas-rtc.o $(rtaspci-y-y)
+obj-$(CONFIG_PPC_RTAS_DAEMON)	+= rtasd.o
+obj-$(CONFIG_RTAS_FLASH)	+= rtas_flash.o
+obj-$(CONFIG_RTAS_PROC)		+= rtas-proc.o
+obj-$(CONFIG_PPC_DT_CPU_FTRS)	+= dt_cpu_ftrs.o
+obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+				  eeh_driver.o eeh_event.o eeh_sysfs.o
+obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
+obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
+obj-$(CONFIG_FA_DUMP)		+= fadump.o
+ifdef CONFIG_PPC32
+obj-$(CONFIG_E500)		+= idle_e500.o
+endif
+obj-$(CONFIG_6xx)		+= idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o
+obj-$(CONFIG_TAU)		+= tau_6xx.o
+obj-$(CONFIG_HIBERNATION)	+= swsusp.o suspend.o
+ifdef CONFIG_FSL_BOOKE
+obj-$(CONFIG_HIBERNATION)	+= swsusp_booke.o
+else
+obj-$(CONFIG_HIBERNATION)	+= swsusp_$(BITS).o
+endif
+obj64-$(CONFIG_HIBERNATION)	+= swsusp_asm64.o
+obj-$(CONFIG_MODULES)		+= module.o module_$(BITS).o
+obj-$(CONFIG_44x)		+= cpu_setup_44x.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o
+obj-$(CONFIG_PPC_DOORBELL)	+= dbell.o
+obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
+
+extra-y				:= head_$(BITS).o
+extra-$(CONFIG_40x)		:= head_40x.o
+extra-$(CONFIG_44x)		:= head_44x.o
+extra-$(CONFIG_FSL_BOOKE)	:= head_fsl_booke.o
+extra-$(CONFIG_PPC_8xx)		:= head_8xx.o
+extra-y				+= vmlinux.lds
+
+obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
+
+obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
+obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
+obj-$(CONFIG_KGDB)		+= kgdb.o
+obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
+obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
+obj-$(CONFIG_KPROBES_ON_FTRACE)	+= kprobes-ftrace.o
+obj-$(CONFIG_UPROBES)		+= uprobes.o
+obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
+obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
+obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
+
+pci64-$(CONFIG_PPC64)		+= pci_dn.o pci-hotplug.o isa-bridge.o
+obj-$(CONFIG_PCI)		+= pci_$(BITS).o $(pci64-y) \
+				   pci-common.o pci_of_scan.o
+obj-$(CONFIG_PCI_MSI)		+= msi.o
+obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec.o crash.o \
+				   machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
+ifdef CONFIG_HAVE_IMA_KEXEC
+ifdef CONFIG_IMA
+obj-y				+= ima_kexec.o
+endif
+endif
+
+obj-$(CONFIG_AUDIT)		+= audit.o
+obj64-$(CONFIG_AUDIT)		+= compat_audit.o
+
+obj-$(CONFIG_PPC_IO_WORKAROUNDS)	+= io-workarounds.o
+
+obj-y				+= trace/
+
+ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
+obj-y				+= iomap.o
+endif
+
+obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM)	+= tm.o
+
+obj-$(CONFIG_PPC64)		+= $(obj64-y)
+obj-$(CONFIG_PPC32)		+= $(obj32-y)
+
+ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),)
+obj-y				+= ppc_save_regs.o
+endif
+
+obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
+obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_prom_init.o := n
+UBSAN_SANITIZE_prom_init.o := n
+GCOV_PROFILE_machine_kexec_64.o := n
+UBSAN_SANITIZE_machine_kexec_64.o := n
+GCOV_PROFILE_machine_kexec_32.o := n
+UBSAN_SANITIZE_machine_kexec_32.o := n
+GCOV_PROFILE_kprobes.o := n
+UBSAN_SANITIZE_kprobes.o := n
+GCOV_PROFILE_kprobes-ftrace.o := n
+UBSAN_SANITIZE_kprobes-ftrace.o := n
+UBSAN_SANITIZE_vdso.o := n
+
+extra-$(CONFIG_PPC_FPU)		+= fpu.o
+extra-$(CONFIG_ALTIVEC)		+= vector.o
+extra-$(CONFIG_PPC64)		+= entry_64.o
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
+
+extra-y				+= systbl_chk.i
+$(obj)/systbl.o:		systbl_chk
+
+quiet_cmd_systbl_chk = CALL    $<
+      cmd_systbl_chk = $(CONFIG_SHELL) $< $(obj)/systbl_chk.i
+
+PHONY += systbl_chk
+systbl_chk: $(src)/systbl_chk.sh $(obj)/systbl_chk.i
+	$(call cmd,systbl_chk)
+
+ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
+$(obj)/built-in.a:		prom_init_check
+
+quiet_cmd_prom_init_check = CALL    $<
+      cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o"
+
+PHONY += prom_init_check
+prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o
+	$(call cmd,prom_init_check)
+endif
+
+clean-files := vmlinux.lds
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
new file mode 100644
index 000000000..11550a3d1
--- /dev/null
+++ b/arch/powerpc/kernel/align.c
@@ -0,0 +1,358 @@
+/* align.c - handle alignment exceptions for the Power PC.
+ *
+ * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ * Copyright (c) 1998-1999 TiVo, Inc.
+ *   PowerPC 403GCX modifications.
+ * Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu>
+ *   PowerPC 403GCX/405GP modifications.
+ * Copyright (c) 2001-2002 PPC64 team, IBM Corp
+ *   64-bit and Power4 support
+ * Copyright (c) 2005 Benjamin Herrenschmidt, IBM Corp
+ *                    <benh@kernel.crashing.org>
+ *   Merge ppc32 and ppc64 implementations
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+#include <asm/cache.h>
+#include <asm/cputable.h>
+#include <asm/emulated_ops.h>
+#include <asm/switch_to.h>
+#include <asm/disassemble.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/sstep.h>
+
+struct aligninfo {
+	unsigned char len;
+	unsigned char flags;
+};
+
+
+#define INVALID	{ 0, 0 }
+
+/* Bits in the flags field */
+#define LD	0	/* load */
+#define ST	1	/* store */
+#define SE	2	/* sign-extend value, or FP ld/st as word */
+#define SW	0x20	/* byte swap */
+#define E4	0x40	/* SPE endianness is word */
+#define E8	0x80	/* SPE endianness is double word */
+
+#ifdef CONFIG_SPE
+
+static struct aligninfo spe_aligninfo[32] = {
+	{ 8, LD+E8 },		/* 0 00 00: evldd[x] */
+	{ 8, LD+E4 },		/* 0 00 01: evldw[x] */
+	{ 8, LD },		/* 0 00 10: evldh[x] */
+	INVALID,		/* 0 00 11 */
+	{ 2, LD },		/* 0 01 00: evlhhesplat[x] */
+	INVALID,		/* 0 01 01 */
+	{ 2, LD },		/* 0 01 10: evlhhousplat[x] */
+	{ 2, LD+SE },		/* 0 01 11: evlhhossplat[x] */
+	{ 4, LD },		/* 0 10 00: evlwhe[x] */
+	INVALID,		/* 0 10 01 */
+	{ 4, LD },		/* 0 10 10: evlwhou[x] */
+	{ 4, LD+SE },		/* 0 10 11: evlwhos[x] */
+	{ 4, LD+E4 },		/* 0 11 00: evlwwsplat[x] */
+	INVALID,		/* 0 11 01 */
+	{ 4, LD },		/* 0 11 10: evlwhsplat[x] */
+	INVALID,		/* 0 11 11 */
+
+	{ 8, ST+E8 },		/* 1 00 00: evstdd[x] */
+	{ 8, ST+E4 },		/* 1 00 01: evstdw[x] */
+	{ 8, ST },		/* 1 00 10: evstdh[x] */
+	INVALID,		/* 1 00 11 */
+	INVALID,		/* 1 01 00 */
+	INVALID,		/* 1 01 01 */
+	INVALID,		/* 1 01 10 */
+	INVALID,		/* 1 01 11 */
+	{ 4, ST },		/* 1 10 00: evstwhe[x] */
+	INVALID,		/* 1 10 01 */
+	{ 4, ST },		/* 1 10 10: evstwho[x] */
+	INVALID,		/* 1 10 11 */
+	{ 4, ST+E4 },		/* 1 11 00: evstwwe[x] */
+	INVALID,		/* 1 11 01 */
+	{ 4, ST+E4 },		/* 1 11 10: evstwwo[x] */
+	INVALID,		/* 1 11 11 */
+};
+
+#define	EVLDD		0x00
+#define	EVLDW		0x01
+#define	EVLDH		0x02
+#define	EVLHHESPLAT	0x04
+#define	EVLHHOUSPLAT	0x06
+#define	EVLHHOSSPLAT	0x07
+#define	EVLWHE		0x08
+#define	EVLWHOU		0x0A
+#define	EVLWHOS		0x0B
+#define	EVLWWSPLAT	0x0C
+#define	EVLWHSPLAT	0x0E
+#define	EVSTDD		0x10
+#define	EVSTDW		0x11
+#define	EVSTDH		0x12
+#define	EVSTWHE		0x18
+#define	EVSTWHO		0x1A
+#define	EVSTWWE		0x1C
+#define	EVSTWWO		0x1E
+
+/*
+ * Emulate SPE loads and stores.
+ * Only Book-E has these instructions, and it does true little-endian,
+ * so we don't need the address swizzling.
+ */
+static int emulate_spe(struct pt_regs *regs, unsigned int reg,
+		       unsigned int instr)
+{
+	int ret;
+	union {
+		u64 ll;
+		u32 w[2];
+		u16 h[4];
+		u8 v[8];
+	} data, temp;
+	unsigned char __user *p, *addr;
+	unsigned long *evr = &current->thread.evr[reg];
+	unsigned int nb, flags;
+
+	instr = (instr >> 1) & 0x1f;
+
+	/* DAR has the operand effective address */
+	addr = (unsigned char __user *)regs->dar;
+
+	nb = spe_aligninfo[instr].len;
+	flags = spe_aligninfo[instr].flags;
+
+	/* Verify the address of the operand */
+	if (unlikely(user_mode(regs) &&
+		     !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ),
+				addr, nb)))
+		return -EFAULT;
+
+	/* userland only */
+	if (unlikely(!user_mode(regs)))
+		return 0;
+
+	flush_spe_to_thread(current);
+
+	/* If we are loading, get the data from user space, else
+	 * get it from register values
+	 */
+	if (flags & ST) {
+		data.ll = 0;
+		switch (instr) {
+		case EVSTDD:
+		case EVSTDW:
+		case EVSTDH:
+			data.w[0] = *evr;
+			data.w[1] = regs->gpr[reg];
+			break;
+		case EVSTWHE:
+			data.h[2] = *evr >> 16;
+			data.h[3] = regs->gpr[reg] >> 16;
+			break;
+		case EVSTWHO:
+			data.h[2] = *evr & 0xffff;
+			data.h[3] = regs->gpr[reg] & 0xffff;
+			break;
+		case EVSTWWE:
+			data.w[1] = *evr;
+			break;
+		case EVSTWWO:
+			data.w[1] = regs->gpr[reg];
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		temp.ll = data.ll = 0;
+		ret = 0;
+		p = addr;
+
+		switch (nb) {
+		case 8:
+			ret |= __get_user_inatomic(temp.v[0], p++);
+			ret |= __get_user_inatomic(temp.v[1], p++);
+			ret |= __get_user_inatomic(temp.v[2], p++);
+			ret |= __get_user_inatomic(temp.v[3], p++);
+		case 4:
+			ret |= __get_user_inatomic(temp.v[4], p++);
+			ret |= __get_user_inatomic(temp.v[5], p++);
+		case 2:
+			ret |= __get_user_inatomic(temp.v[6], p++);
+			ret |= __get_user_inatomic(temp.v[7], p++);
+			if (unlikely(ret))
+				return -EFAULT;
+		}
+
+		switch (instr) {
+		case EVLDD:
+		case EVLDW:
+		case EVLDH:
+			data.ll = temp.ll;
+			break;
+		case EVLHHESPLAT:
+			data.h[0] = temp.h[3];
+			data.h[2] = temp.h[3];
+			break;
+		case EVLHHOUSPLAT:
+		case EVLHHOSSPLAT:
+			data.h[1] = temp.h[3];
+			data.h[3] = temp.h[3];
+			break;
+		case EVLWHE:
+			data.h[0] = temp.h[2];
+			data.h[2] = temp.h[3];
+			break;
+		case EVLWHOU:
+		case EVLWHOS:
+			data.h[1] = temp.h[2];
+			data.h[3] = temp.h[3];
+			break;
+		case EVLWWSPLAT:
+			data.w[0] = temp.w[1];
+			data.w[1] = temp.w[1];
+			break;
+		case EVLWHSPLAT:
+			data.h[0] = temp.h[2];
+			data.h[1] = temp.h[2];
+			data.h[2] = temp.h[3];
+			data.h[3] = temp.h[3];
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (flags & SW) {
+		switch (flags & 0xf0) {
+		case E8:
+			data.ll = swab64(data.ll);
+			break;
+		case E4:
+			data.w[0] = swab32(data.w[0]);
+			data.w[1] = swab32(data.w[1]);
+			break;
+		/* Its half word endian */
+		default:
+			data.h[0] = swab16(data.h[0]);
+			data.h[1] = swab16(data.h[1]);
+			data.h[2] = swab16(data.h[2]);
+			data.h[3] = swab16(data.h[3]);
+			break;
+		}
+	}
+
+	if (flags & SE) {
+		data.w[0] = (s16)data.h[1];
+		data.w[1] = (s16)data.h[3];
+	}
+
+	/* Store result to memory or update registers */
+	if (flags & ST) {
+		ret = 0;
+		p = addr;
+		switch (nb) {
+		case 8:
+			ret |= __put_user_inatomic(data.v[0], p++);
+			ret |= __put_user_inatomic(data.v[1], p++);
+			ret |= __put_user_inatomic(data.v[2], p++);
+			ret |= __put_user_inatomic(data.v[3], p++);
+		case 4:
+			ret |= __put_user_inatomic(data.v[4], p++);
+			ret |= __put_user_inatomic(data.v[5], p++);
+		case 2:
+			ret |= __put_user_inatomic(data.v[6], p++);
+			ret |= __put_user_inatomic(data.v[7], p++);
+		}
+		if (unlikely(ret))
+			return -EFAULT;
+	} else {
+		*evr = data.w[0];
+		regs->gpr[reg] = data.w[1];
+	}
+
+	return 1;
+}
+#endif /* CONFIG_SPE */
+
+/*
+ * Called on alignment exception. Attempts to fixup
+ *
+ * Return 1 on success
+ * Return 0 if unable to handle the interrupt
+ * Return -EFAULT if data address is bad
+ * Other negative return values indicate that the instruction can't
+ * be emulated, and the process should be given a SIGBUS.
+ */
+
+int fix_alignment(struct pt_regs *regs)
+{
+	unsigned int instr;
+	struct instruction_op op;
+	int r, type;
+
+	/*
+	 * We require a complete register set, if not, then our assembly
+	 * is broken
+	 */
+	CHECK_FULL_REGS(regs);
+
+	if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip)))
+		return -EFAULT;
+	if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
+		/* We don't handle PPC little-endian any more... */
+		if (cpu_has_feature(CPU_FTR_PPC_LE))
+			return -EIO;
+		instr = swab32(instr);
+	}
+
+#ifdef CONFIG_SPE
+	if ((instr >> 26) == 0x4) {
+		int reg = (instr >> 21) & 0x1f;
+		PPC_WARN_ALIGNMENT(spe, regs);
+		return emulate_spe(regs, reg, instr);
+	}
+#endif
+
+
+	/*
+	 * ISA 3.0 (such as P9) copy, copy_first, paste and paste_last alignment
+	 * check.
+	 *
+	 * Send a SIGBUS to the process that caused the fault.
+	 *
+	 * We do not emulate these because paste may contain additional metadata
+	 * when pasting to a co-processor. Furthermore, paste_last is the
+	 * synchronisation point for preceding copy/paste sequences.
+	 */
+	if ((instr & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe))
+		return -EIO;
+
+	r = analyse_instr(&op, regs, instr);
+	if (r < 0)
+		return -EINVAL;
+
+	type = GETTYPE(op.type);
+	if (!OP_IS_LOAD_STORE(type)) {
+		if (op.type != CACHEOP + DCBZ)
+			return -EINVAL;
+		PPC_WARN_ALIGNMENT(dcbz, regs);
+		r = emulate_dcbz(op.ea, regs);
+	} else {
+		if (type == LARX || type == STCX)
+			return -EIO;
+		PPC_WARN_ALIGNMENT(unaligned, regs);
+		r = emulate_loadstore(regs, &op);
+	}
+
+	if (!r)
+		return 1;
+	return r;
+}
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
new file mode 100644
index 000000000..50400f213
--- /dev/null
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -0,0 +1,789 @@
+/*
+ * This program is used to generate definitions needed by
+ * assembly language modules.
+ *
+ * We use the technique used in the OSF Mach kernel code:
+ * generate asm statements containing #defines,
+ * compile this file to assembler, and then extract the
+ * #defines from the assembly-language output.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/compat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/hrtimer.h>
+#ifdef CONFIG_PPC64
+#include <linux/time.h>
+#include <linux/hardirq.h>
+#endif
+#include <linux/kbuild.h>
+
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/rtas.h>
+#include <asm/vdso_datapage.h>
+#include <asm/dbell.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/lppaca.h>
+#include <asm/cache.h>
+#include <asm/mmu.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#endif
+#ifdef CONFIG_PPC_POWERNV
+#include <asm/opal.h>
+#endif
+#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
+#include <linux/kvm_host.h>
+#endif
+#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#endif
+
+#ifdef CONFIG_PPC32
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+#include "head_booke.h"
+#endif
+#endif
+
+#if defined(CONFIG_PPC_FSL_BOOK3E)
+#include "../mm/mmu_decl.h"
+#endif
+
+#ifdef CONFIG_PPC_8xx
+#include <asm/fixmap.h>
+#endif
+
+#define STACK_PT_REGS_OFFSET(sym, val)	\
+	DEFINE(sym, STACK_FRAME_OVERHEAD + offsetof(struct pt_regs, val))
+
+int main(void)
+{
+	OFFSET(THREAD, task_struct, thread);
+	OFFSET(MM, task_struct, mm);
+	OFFSET(MMCONTEXTID, mm_struct, context.id);
+#ifdef CONFIG_PPC64
+	DEFINE(SIGSEGV, SIGSEGV);
+	DEFINE(NMI_MASK, NMI_MASK);
+	OFFSET(TASKTHREADPPR, task_struct, thread.ppr);
+#else
+	OFFSET(THREAD_INFO, task_struct, stack);
+	DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
+	OFFSET(KSP_LIMIT, thread_struct, ksp_limit);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_LIVEPATCH
+	OFFSET(TI_livepatch_sp, thread_info, livepatch_sp);
+#endif
+
+	OFFSET(KSP, thread_struct, ksp);
+	OFFSET(PT_REGS, thread_struct, regs);
+#ifdef CONFIG_BOOKE
+	OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]);
+#endif
+	OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode);
+	OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr);
+	OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area);
+	OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr);
+	OFFSET(THREAD_LOAD_FP, thread_struct, load_fp);
+#ifdef CONFIG_ALTIVEC
+	OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr);
+	OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area);
+	OFFSET(THREAD_VRSAVE, thread_struct, vrsave);
+	OFFSET(THREAD_USED_VR, thread_struct, used_vr);
+	OFFSET(VRSTATE_VSCR, thread_vr_state, vscr);
+	OFFSET(THREAD_LOAD_VEC, thread_struct, load_vec);
+#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_VSX
+	OFFSET(THREAD_USED_VSR, thread_struct, used_vsr);
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_PPC64
+	OFFSET(KSP_VSID, thread_struct, ksp_vsid);
+#else /* CONFIG_PPC64 */
+	OFFSET(PGDIR, thread_struct, pgdir);
+#ifdef CONFIG_SPE
+	OFFSET(THREAD_EVR0, thread_struct, evr[0]);
+	OFFSET(THREAD_ACC, thread_struct, acc);
+	OFFSET(THREAD_SPEFSCR, thread_struct, spefscr);
+	OFFSET(THREAD_USED_SPE, thread_struct, used_spe);
+#endif /* CONFIG_SPE */
+#endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+	OFFSET(THREAD_DBCR0, thread_struct, debug.dbcr0);
+#endif
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	OFFSET(THREAD_KVM_SVCPU, thread_struct, kvm_shadow_vcpu);
+#endif
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	OFFSET(THREAD_KVM_VCPU, thread_struct, kvm_vcpu);
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	OFFSET(PACATMSCRATCH, paca_struct, tm_scratch);
+	OFFSET(THREAD_TM_TFHAR, thread_struct, tm_tfhar);
+	OFFSET(THREAD_TM_TEXASR, thread_struct, tm_texasr);
+	OFFSET(THREAD_TM_TFIAR, thread_struct, tm_tfiar);
+	OFFSET(THREAD_TM_TAR, thread_struct, tm_tar);
+	OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr);
+	OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr);
+	OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs);
+	OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr);
+	OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave);
+	OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr);
+	/* Local pt_regs on stack for Transactional Memory funcs. */
+	DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
+	       sizeof(struct pt_regs) + 16);
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+	OFFSET(TI_FLAGS, thread_info, flags);
+	OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
+	OFFSET(TI_PREEMPT, thread_info, preempt_count);
+	OFFSET(TI_TASK, thread_info, task);
+	OFFSET(TI_CPU, thread_info, cpu);
+
+#ifdef CONFIG_PPC64
+	OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size);
+	OFFSET(DCACHEL1LOGBLOCKSIZE, ppc64_caches, l1d.log_block_size);
+	OFFSET(DCACHEL1BLOCKSPERPAGE, ppc64_caches, l1d.blocks_per_page);
+	OFFSET(ICACHEL1BLOCKSIZE, ppc64_caches, l1i.block_size);
+	OFFSET(ICACHEL1LOGBLOCKSIZE, ppc64_caches, l1i.log_block_size);
+	OFFSET(ICACHEL1BLOCKSPERPAGE, ppc64_caches, l1i.blocks_per_page);
+	/* paca */
+	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+	OFFSET(PACAPACAINDEX, paca_struct, paca_index);
+	OFFSET(PACAPROCSTART, paca_struct, cpu_start);
+	OFFSET(PACAKSAVE, paca_struct, kstack);
+	OFFSET(PACACURRENT, paca_struct, __current);
+	OFFSET(PACASAVEDMSR, paca_struct, saved_msr);
+	OFFSET(PACASTABRR, paca_struct, stab_rr);
+	OFFSET(PACAR1, paca_struct, saved_r1);
+	OFFSET(PACATOC, paca_struct, kernel_toc);
+	OFFSET(PACAKBASE, paca_struct, kernelbase);
+	OFFSET(PACAKMSR, paca_struct, kernel_msr);
+	OFFSET(PACAIRQSOFTMASK, paca_struct, irq_soft_mask);
+	OFFSET(PACAIRQHAPPENED, paca_struct, irq_happened);
+	OFFSET(PACA_FTRACE_ENABLED, paca_struct, ftrace_enabled);
+#ifdef CONFIG_PPC_BOOK3S
+	OFFSET(PACACONTEXTID, paca_struct, mm_ctx_id);
+#ifdef CONFIG_PPC_MM_SLICES
+	OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
+	OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
+	OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
+	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
+#endif /* CONFIG_PPC_MM_SLICES */
+#endif
+
+#ifdef CONFIG_PPC_BOOK3E
+	OFFSET(PACAPGD, paca_struct, pgd);
+	OFFSET(PACA_KERNELPGD, paca_struct, kernel_pgd);
+	OFFSET(PACA_EXGEN, paca_struct, exgen);
+	OFFSET(PACA_EXTLB, paca_struct, extlb);
+	OFFSET(PACA_EXMC, paca_struct, exmc);
+	OFFSET(PACA_EXCRIT, paca_struct, excrit);
+	OFFSET(PACA_EXDBG, paca_struct, exdbg);
+	OFFSET(PACA_MC_STACK, paca_struct, mc_kstack);
+	OFFSET(PACA_CRIT_STACK, paca_struct, crit_kstack);
+	OFFSET(PACA_DBG_STACK, paca_struct, dbg_kstack);
+	OFFSET(PACA_TCD_PTR, paca_struct, tcd_ptr);
+
+	OFFSET(TCD_ESEL_NEXT, tlb_core_data, esel_next);
+	OFFSET(TCD_ESEL_MAX, tlb_core_data, esel_max);
+	OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first);
+#endif /* CONFIG_PPC_BOOK3E */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	OFFSET(PACASLBCACHE, paca_struct, slb_cache);
+	OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr);
+	OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp);
+#ifdef CONFIG_PPC_MM_SLICES
+	OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp);
+#else
+	OFFSET(PACACONTEXTSLLP, paca_struct, mm_ctx_sllp);
+#endif /* CONFIG_PPC_MM_SLICES */
+	OFFSET(PACA_EXGEN, paca_struct, exgen);
+	OFFSET(PACA_EXMC, paca_struct, exmc);
+	OFFSET(PACA_EXSLB, paca_struct, exslb);
+	OFFSET(PACA_EXNMI, paca_struct, exnmi);
+#ifdef CONFIG_PPC_PSERIES
+	OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
+#endif
+	OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
+	OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
+	OFFSET(SLBSHADOW_STACKESID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid);
+	OFFSET(SLBSHADOW_SAVEAREA, slb_shadow, save_area);
+	OFFSET(LPPACA_PMCINUSE, lppaca, pmcregs_in_use);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	OFFSET(PACA_PMCINUSE, paca_struct, pmcregs_in_use);
+#endif
+	OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
+	OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
+	OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
+#endif /* CONFIG_PPC_BOOK3S_64 */
+	OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
+#ifdef CONFIG_PPC_BOOK3S_64
+	OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
+	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
+	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
+	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
+	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
+	OFFSET(PACA_EXRFI, paca_struct, exrfi);
+	OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);
+
+#endif
+	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
+	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
+	OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default);
+	OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime);
+	OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
+	OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
+	OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime);
+	OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save);
+	OFFSET(PACA_NAPSTATELOST, paca_struct, nap_state_lost);
+	OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso);
+#else /* CONFIG_PPC64 */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	OFFSET(ACCOUNT_STARTTIME, thread_info, accounting.starttime);
+	OFFSET(ACCOUNT_STARTTIME_USER, thread_info, accounting.starttime_user);
+	OFFSET(ACCOUNT_USER_TIME, thread_info, accounting.utime);
+	OFFSET(ACCOUNT_SYSTEM_TIME, thread_info, accounting.stime);
+#endif
+#endif /* CONFIG_PPC64 */
+
+	/* RTAS */
+	OFFSET(RTASBASE, rtas_t, base);
+	OFFSET(RTASENTRY, rtas_t, entry);
+
+	/* Interrupt register frame */
+	DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE);
+	DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs));
+#ifdef CONFIG_PPC64
+	/* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */
+	DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
+	DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16);
+#endif /* CONFIG_PPC64 */
+	STACK_PT_REGS_OFFSET(GPR0, gpr[0]);
+	STACK_PT_REGS_OFFSET(GPR1, gpr[1]);
+	STACK_PT_REGS_OFFSET(GPR2, gpr[2]);
+	STACK_PT_REGS_OFFSET(GPR3, gpr[3]);
+	STACK_PT_REGS_OFFSET(GPR4, gpr[4]);
+	STACK_PT_REGS_OFFSET(GPR5, gpr[5]);
+	STACK_PT_REGS_OFFSET(GPR6, gpr[6]);
+	STACK_PT_REGS_OFFSET(GPR7, gpr[7]);
+	STACK_PT_REGS_OFFSET(GPR8, gpr[8]);
+	STACK_PT_REGS_OFFSET(GPR9, gpr[9]);
+	STACK_PT_REGS_OFFSET(GPR10, gpr[10]);
+	STACK_PT_REGS_OFFSET(GPR11, gpr[11]);
+	STACK_PT_REGS_OFFSET(GPR12, gpr[12]);
+	STACK_PT_REGS_OFFSET(GPR13, gpr[13]);
+#ifndef CONFIG_PPC64
+	STACK_PT_REGS_OFFSET(GPR14, gpr[14]);
+#endif /* CONFIG_PPC64 */
+	/*
+	 * Note: these symbols include _ because they overlap with special
+	 * register names
+	 */
+	STACK_PT_REGS_OFFSET(_NIP, nip);
+	STACK_PT_REGS_OFFSET(_MSR, msr);
+	STACK_PT_REGS_OFFSET(_CTR, ctr);
+	STACK_PT_REGS_OFFSET(_LINK, link);
+	STACK_PT_REGS_OFFSET(_CCR, ccr);
+	STACK_PT_REGS_OFFSET(_XER, xer);
+	STACK_PT_REGS_OFFSET(_DAR, dar);
+	STACK_PT_REGS_OFFSET(_DSISR, dsisr);
+	STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3);
+	STACK_PT_REGS_OFFSET(RESULT, result);
+	STACK_PT_REGS_OFFSET(_TRAP, trap);
+#ifndef CONFIG_PPC64
+	/*
+	 * The PowerPC 400-class & Book-E processors have neither the DAR
+	 * nor the DSISR SPRs. Hence, we overload them to hold the similar
+	 * DEAR and ESR SPRs for such processors.  For critical interrupts
+	 * we use them to hold SRR0 and SRR1.
+	 */
+	STACK_PT_REGS_OFFSET(_DEAR, dar);
+	STACK_PT_REGS_OFFSET(_ESR, dsisr);
+#else /* CONFIG_PPC64 */
+	STACK_PT_REGS_OFFSET(SOFTE, softe);
+
+	/* These _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */
+	DEFINE(_SRR0, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs));
+	DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8);
+#endif /* CONFIG_PPC64 */
+
+#if defined(CONFIG_PPC32)
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+	DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE);
+	DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
+	/* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */
+	DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0));
+	DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1));
+	DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2));
+	DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3));
+	DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6));
+	DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7));
+	DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0));
+	DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1));
+	DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0));
+	DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1));
+	DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0));
+	DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1));
+	DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, saved_ksp_limit));
+#endif
+#endif
+
+#ifndef CONFIG_PPC64
+	OFFSET(MM_PGD, mm_struct, pgd);
+#endif /* ! CONFIG_PPC64 */
+
+	/* About the CPU features table */
+	OFFSET(CPU_SPEC_FEATURES, cpu_spec, cpu_features);
+	OFFSET(CPU_SPEC_SETUP, cpu_spec, cpu_setup);
+	OFFSET(CPU_SPEC_RESTORE, cpu_spec, cpu_restore);
+
+	OFFSET(pbe_address, pbe, address);
+	OFFSET(pbe_orig_address, pbe, orig_address);
+	OFFSET(pbe_next, pbe, next);
+
+#ifndef CONFIG_PPC64
+	DEFINE(TASK_SIZE, TASK_SIZE);
+	DEFINE(NUM_USER_SEGMENTS, TASK_SIZE>>28);
+#endif /* ! CONFIG_PPC64 */
+
+	/* datapage offsets for use by vdso */
+	OFFSET(CFG_TB_ORIG_STAMP, vdso_data, tb_orig_stamp);
+	OFFSET(CFG_TB_TICKS_PER_SEC, vdso_data, tb_ticks_per_sec);
+	OFFSET(CFG_TB_TO_XS, vdso_data, tb_to_xs);
+	OFFSET(CFG_TB_UPDATE_COUNT, vdso_data, tb_update_count);
+	OFFSET(CFG_TZ_MINUTEWEST, vdso_data, tz_minuteswest);
+	OFFSET(CFG_TZ_DSTTIME, vdso_data, tz_dsttime);
+	OFFSET(CFG_SYSCALL_MAP32, vdso_data, syscall_map_32);
+	OFFSET(WTOM_CLOCK_SEC, vdso_data, wtom_clock_sec);
+	OFFSET(WTOM_CLOCK_NSEC, vdso_data, wtom_clock_nsec);
+	OFFSET(STAMP_XTIME, vdso_data, stamp_xtime);
+	OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction);
+	OFFSET(CLOCK_HRTIMER_RES, vdso_data, hrtimer_res);
+	OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size);
+	OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size);
+	OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size);
+	OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_data, dcache_log_block_size);
+#ifdef CONFIG_PPC64
+	OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64);
+	OFFSET(TVAL64_TV_SEC, timeval, tv_sec);
+	OFFSET(TVAL64_TV_USEC, timeval, tv_usec);
+	OFFSET(TVAL32_TV_SEC, compat_timeval, tv_sec);
+	OFFSET(TVAL32_TV_USEC, compat_timeval, tv_usec);
+	OFFSET(TSPC64_TV_SEC, timespec, tv_sec);
+	OFFSET(TSPC64_TV_NSEC, timespec, tv_nsec);
+	OFFSET(TSPC32_TV_SEC, compat_timespec, tv_sec);
+	OFFSET(TSPC32_TV_NSEC, compat_timespec, tv_nsec);
+#else
+	OFFSET(TVAL32_TV_SEC, timeval, tv_sec);
+	OFFSET(TVAL32_TV_USEC, timeval, tv_usec);
+	OFFSET(TSPC32_TV_SEC, timespec, tv_sec);
+	OFFSET(TSPC32_TV_NSEC, timespec, tv_nsec);
+#endif
+	/* timeval/timezone offsets for use by vdso */
+	OFFSET(TZONE_TZ_MINWEST, timezone, tz_minuteswest);
+	OFFSET(TZONE_TZ_DSTTIME, timezone, tz_dsttime);
+
+	/* Other bits used by the vdso */
+	DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
+	DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
+	DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
+	DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
+	DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
+
+#ifdef CONFIG_BUG
+	DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE)));
+#else
+	DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
+#endif
+	DEFINE(PTE_SIZE, sizeof(pte_t));
+
+#ifdef CONFIG_KVM
+	OFFSET(VCPU_HOST_STACK, kvm_vcpu, arch.host_stack);
+	OFFSET(VCPU_HOST_PID, kvm_vcpu, arch.host_pid);
+	OFFSET(VCPU_GUEST_PID, kvm_vcpu, arch.pid);
+	OFFSET(VCPU_GPRS, kvm_vcpu, arch.regs.gpr);
+	OFFSET(VCPU_VRSAVE, kvm_vcpu, arch.vrsave);
+	OFFSET(VCPU_FPRS, kvm_vcpu, arch.fp.fpr);
+#ifdef CONFIG_ALTIVEC
+	OFFSET(VCPU_VRS, kvm_vcpu, arch.vr.vr);
+#endif
+	OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
+	OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
+	OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
+#ifdef CONFIG_PPC_BOOK3S
+	OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
+#endif
+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
+	OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
+	OFFSET(VCPU_SRR0, kvm_vcpu, arch.shregs.srr0);
+	OFFSET(VCPU_SRR1, kvm_vcpu, arch.shregs.srr1);
+	OFFSET(VCPU_SPRG0, kvm_vcpu, arch.shregs.sprg0);
+	OFFSET(VCPU_SPRG1, kvm_vcpu, arch.shregs.sprg1);
+	OFFSET(VCPU_SPRG2, kvm_vcpu, arch.shregs.sprg2);
+	OFFSET(VCPU_SPRG3, kvm_vcpu, arch.shregs.sprg3);
+#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	OFFSET(VCPU_TB_RMENTRY, kvm_vcpu, arch.rm_entry);
+	OFFSET(VCPU_TB_RMINTR, kvm_vcpu, arch.rm_intr);
+	OFFSET(VCPU_TB_RMEXIT, kvm_vcpu, arch.rm_exit);
+	OFFSET(VCPU_TB_GUEST, kvm_vcpu, arch.guest_time);
+	OFFSET(VCPU_TB_CEDE, kvm_vcpu, arch.cede_time);
+	OFFSET(VCPU_CUR_ACTIVITY, kvm_vcpu, arch.cur_activity);
+	OFFSET(VCPU_ACTIVITY_START, kvm_vcpu, arch.cur_tb_start);
+	OFFSET(TAS_SEQCOUNT, kvmhv_tb_accumulator, seqcount);
+	OFFSET(TAS_TOTAL, kvmhv_tb_accumulator, tb_total);
+	OFFSET(TAS_MIN, kvmhv_tb_accumulator, tb_min);
+	OFFSET(TAS_MAX, kvmhv_tb_accumulator, tb_max);
+#endif
+	OFFSET(VCPU_SHARED_SPRG3, kvm_vcpu_arch_shared, sprg3);
+	OFFSET(VCPU_SHARED_SPRG4, kvm_vcpu_arch_shared, sprg4);
+	OFFSET(VCPU_SHARED_SPRG5, kvm_vcpu_arch_shared, sprg5);
+	OFFSET(VCPU_SHARED_SPRG6, kvm_vcpu_arch_shared, sprg6);
+	OFFSET(VCPU_SHARED_SPRG7, kvm_vcpu_arch_shared, sprg7);
+	OFFSET(VCPU_SHADOW_PID, kvm_vcpu, arch.shadow_pid);
+	OFFSET(VCPU_SHADOW_PID1, kvm_vcpu, arch.shadow_pid1);
+	OFFSET(VCPU_SHARED, kvm_vcpu, arch.shared);
+	OFFSET(VCPU_SHARED_MSR, kvm_vcpu_arch_shared, msr);
+	OFFSET(VCPU_SHADOW_MSR, kvm_vcpu, arch.shadow_msr);
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+	OFFSET(VCPU_SHAREDBE, kvm_vcpu, arch.shared_big_endian);
+#endif
+
+	OFFSET(VCPU_SHARED_MAS0, kvm_vcpu_arch_shared, mas0);
+	OFFSET(VCPU_SHARED_MAS1, kvm_vcpu_arch_shared, mas1);
+	OFFSET(VCPU_SHARED_MAS2, kvm_vcpu_arch_shared, mas2);
+	OFFSET(VCPU_SHARED_MAS7_3, kvm_vcpu_arch_shared, mas7_3);
+	OFFSET(VCPU_SHARED_MAS4, kvm_vcpu_arch_shared, mas4);
+	OFFSET(VCPU_SHARED_MAS6, kvm_vcpu_arch_shared, mas6);
+
+	OFFSET(VCPU_KVM, kvm_vcpu, kvm);
+	OFFSET(KVM_LPID, kvm, arch.lpid);
+
+	/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	OFFSET(KVM_TLB_SETS, kvm, arch.tlb_sets);
+	OFFSET(KVM_SDR1, kvm, arch.sdr1);
+	OFFSET(KVM_HOST_LPID, kvm, arch.host_lpid);
+	OFFSET(KVM_HOST_LPCR, kvm, arch.host_lpcr);
+	OFFSET(KVM_HOST_SDR1, kvm, arch.host_sdr1);
+	OFFSET(KVM_NEED_FLUSH, kvm, arch.need_tlb_flush.bits);
+	OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
+	OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
+	OFFSET(KVM_RADIX, kvm, arch.radix);
+	OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
+	OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
+	OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
+	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
+	OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
+	OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
+	OFFSET(VCPU_CPU, kvm_vcpu, cpu);
+	OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
+#endif
+#ifdef CONFIG_PPC_BOOK3S
+	OFFSET(VCPU_PURR, kvm_vcpu, arch.purr);
+	OFFSET(VCPU_SPURR, kvm_vcpu, arch.spurr);
+	OFFSET(VCPU_IC, kvm_vcpu, arch.ic);
+	OFFSET(VCPU_DSCR, kvm_vcpu, arch.dscr);
+	OFFSET(VCPU_AMR, kvm_vcpu, arch.amr);
+	OFFSET(VCPU_UAMOR, kvm_vcpu, arch.uamor);
+	OFFSET(VCPU_IAMR, kvm_vcpu, arch.iamr);
+	OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl);
+	OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr);
+	OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
+	OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr);
+	OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx);
+	OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
+	OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
+	OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
+	OFFSET(VCPU_DEC_EXPIRES, kvm_vcpu, arch.dec_expires);
+	OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
+	OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
+	OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
+	OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending);
+	OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
+	OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
+	OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
+	OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
+	OFFSET(VCPU_SIAR, kvm_vcpu, arch.siar);
+	OFFSET(VCPU_SDAR, kvm_vcpu, arch.sdar);
+	OFFSET(VCPU_SIER, kvm_vcpu, arch.sier);
+	OFFSET(VCPU_SLB, kvm_vcpu, arch.slb);
+	OFFSET(VCPU_SLB_MAX, kvm_vcpu, arch.slb_max);
+	OFFSET(VCPU_SLB_NR, kvm_vcpu, arch.slb_nr);
+	OFFSET(VCPU_FAULT_DSISR, kvm_vcpu, arch.fault_dsisr);
+	OFFSET(VCPU_FAULT_DAR, kvm_vcpu, arch.fault_dar);
+	OFFSET(VCPU_FAULT_GPA, kvm_vcpu, arch.fault_gpa);
+	OFFSET(VCPU_INTR_MSR, kvm_vcpu, arch.intr_msr);
+	OFFSET(VCPU_LAST_INST, kvm_vcpu, arch.last_inst);
+	OFFSET(VCPU_TRAP, kvm_vcpu, arch.trap);
+	OFFSET(VCPU_CFAR, kvm_vcpu, arch.cfar);
+	OFFSET(VCPU_PPR, kvm_vcpu, arch.ppr);
+	OFFSET(VCPU_FSCR, kvm_vcpu, arch.fscr);
+	OFFSET(VCPU_PSPB, kvm_vcpu, arch.pspb);
+	OFFSET(VCPU_EBBHR, kvm_vcpu, arch.ebbhr);
+	OFFSET(VCPU_EBBRR, kvm_vcpu, arch.ebbrr);
+	OFFSET(VCPU_BESCR, kvm_vcpu, arch.bescr);
+	OFFSET(VCPU_CSIGR, kvm_vcpu, arch.csigr);
+	OFFSET(VCPU_TACR, kvm_vcpu, arch.tacr);
+	OFFSET(VCPU_TCSCR, kvm_vcpu, arch.tcscr);
+	OFFSET(VCPU_ACOP, kvm_vcpu, arch.acop);
+	OFFSET(VCPU_WORT, kvm_vcpu, arch.wort);
+	OFFSET(VCPU_TID, kvm_vcpu, arch.tid);
+	OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr);
+	OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr);
+	OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map);
+	OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest);
+	OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
+	OFFSET(VCORE_KVM, kvmppc_vcore, kvm);
+	OFFSET(VCORE_TB_OFFSET, kvmppc_vcore, tb_offset);
+	OFFSET(VCORE_TB_OFFSET_APPL, kvmppc_vcore, tb_offset_applied);
+	OFFSET(VCORE_LPCR, kvmppc_vcore, lpcr);
+	OFFSET(VCORE_PCR, kvmppc_vcore, pcr);
+	OFFSET(VCORE_DPDES, kvmppc_vcore, dpdes);
+	OFFSET(VCORE_VTB, kvmppc_vcore, vtb);
+	OFFSET(VCPU_SLB_E, kvmppc_slb, orige);
+	OFFSET(VCPU_SLB_V, kvmppc_slb, origv);
+	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	OFFSET(VCPU_TFHAR, kvm_vcpu, arch.tfhar);
+	OFFSET(VCPU_TFIAR, kvm_vcpu, arch.tfiar);
+	OFFSET(VCPU_TEXASR, kvm_vcpu, arch.texasr);
+	OFFSET(VCPU_ORIG_TEXASR, kvm_vcpu, arch.orig_texasr);
+	OFFSET(VCPU_GPR_TM, kvm_vcpu, arch.gpr_tm);
+	OFFSET(VCPU_FPRS_TM, kvm_vcpu, arch.fp_tm.fpr);
+	OFFSET(VCPU_VRS_TM, kvm_vcpu, arch.vr_tm.vr);
+	OFFSET(VCPU_VRSAVE_TM, kvm_vcpu, arch.vrsave_tm);
+	OFFSET(VCPU_CR_TM, kvm_vcpu, arch.cr_tm);
+	OFFSET(VCPU_XER_TM, kvm_vcpu, arch.xer_tm);
+	OFFSET(VCPU_LR_TM, kvm_vcpu, arch.lr_tm);
+	OFFSET(VCPU_CTR_TM, kvm_vcpu, arch.ctr_tm);
+	OFFSET(VCPU_AMR_TM, kvm_vcpu, arch.amr_tm);
+	OFFSET(VCPU_PPR_TM, kvm_vcpu, arch.ppr_tm);
+	OFFSET(VCPU_DSCR_TM, kvm_vcpu, arch.dscr_tm);
+	OFFSET(VCPU_TAR_TM, kvm_vcpu, arch.tar_tm);
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	OFFSET(PACA_SVCPU, paca_struct, shadow_vcpu);
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else	/* 32-bit */
+# define SVCPU_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f)	DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
+#endif
+
+	SVCPU_FIELD(SVCPU_CR, cr);
+	SVCPU_FIELD(SVCPU_XER, xer);
+	SVCPU_FIELD(SVCPU_CTR, ctr);
+	SVCPU_FIELD(SVCPU_LR, lr);
+	SVCPU_FIELD(SVCPU_PC, pc);
+	SVCPU_FIELD(SVCPU_R0, gpr[0]);
+	SVCPU_FIELD(SVCPU_R1, gpr[1]);
+	SVCPU_FIELD(SVCPU_R2, gpr[2]);
+	SVCPU_FIELD(SVCPU_R3, gpr[3]);
+	SVCPU_FIELD(SVCPU_R4, gpr[4]);
+	SVCPU_FIELD(SVCPU_R5, gpr[5]);
+	SVCPU_FIELD(SVCPU_R6, gpr[6]);
+	SVCPU_FIELD(SVCPU_R7, gpr[7]);
+	SVCPU_FIELD(SVCPU_R8, gpr[8]);
+	SVCPU_FIELD(SVCPU_R9, gpr[9]);
+	SVCPU_FIELD(SVCPU_R10, gpr[10]);
+	SVCPU_FIELD(SVCPU_R11, gpr[11]);
+	SVCPU_FIELD(SVCPU_R12, gpr[12]);
+	SVCPU_FIELD(SVCPU_R13, gpr[13]);
+	SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+	SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+	SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+	SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
+#ifdef CONFIG_PPC_BOOK3S_32
+	SVCPU_FIELD(SVCPU_SR, sr);
+#endif
+#ifdef CONFIG_PPC64
+	SVCPU_FIELD(SVCPU_SLB, slb);
+	SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+	SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr);
+#endif
+
+	HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+	HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+	HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
+	HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+	HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+	HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+	HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
+	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
+	HSTATE_FIELD(HSTATE_NAPPING, napping);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
+	HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
+	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+	HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys);
+	HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt);
+	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
+	HSTATE_FIELD(HSTATE_TID, tid);
+	HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend);
+	HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
+	HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
+	HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
+	HSTATE_FIELD(HSTATE_SIAR, host_mmcr[3]);
+	HSTATE_FIELD(HSTATE_SDAR, host_mmcr[4]);
+	HSTATE_FIELD(HSTATE_MMCR2, host_mmcr[5]);
+	HSTATE_FIELD(HSTATE_SIER, host_mmcr[6]);
+	HSTATE_FIELD(HSTATE_PMC1, host_pmc[0]);
+	HSTATE_FIELD(HSTATE_PMC2, host_pmc[1]);
+	HSTATE_FIELD(HSTATE_PMC3, host_pmc[2]);
+	HSTATE_FIELD(HSTATE_PMC4, host_pmc[3]);
+	HSTATE_FIELD(HSTATE_PMC5, host_pmc[4]);
+	HSTATE_FIELD(HSTATE_PMC6, host_pmc[5]);
+	HSTATE_FIELD(HSTATE_PURR, host_purr);
+	HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+	HSTATE_FIELD(HSTATE_DABR, dabr);
+	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+	HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
+	DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+	OFFSET(KVM_SPLIT_RPR, kvm_split_mode, rpr);
+	OFFSET(KVM_SPLIT_PMMAR, kvm_split_mode, pmmar);
+	OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
+	OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
+	OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+	OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+	OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	HSTATE_FIELD(HSTATE_CFAR, cfar);
+	HSTATE_FIELD(HSTATE_PPR, ppr);
+	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#else /* CONFIG_PPC_BOOK3S */
+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
+	OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
+	OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
+	OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
+	OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
+	OFFSET(VCPU_SPRG9, kvm_vcpu, arch.sprg9);
+	OFFSET(VCPU_LAST_INST, kvm_vcpu, arch.last_inst);
+	OFFSET(VCPU_FAULT_DEAR, kvm_vcpu, arch.fault_dear);
+	OFFSET(VCPU_FAULT_ESR, kvm_vcpu, arch.fault_esr);
+	OFFSET(VCPU_CRIT_SAVE, kvm_vcpu, arch.crit_save);
+#endif /* CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_KVM */
+
+#ifdef CONFIG_KVM_GUEST
+	OFFSET(KVM_MAGIC_SCRATCH1, kvm_vcpu_arch_shared, scratch1);
+	OFFSET(KVM_MAGIC_SCRATCH2, kvm_vcpu_arch_shared, scratch2);
+	OFFSET(KVM_MAGIC_SCRATCH3, kvm_vcpu_arch_shared, scratch3);
+	OFFSET(KVM_MAGIC_INT, kvm_vcpu_arch_shared, int_pending);
+	OFFSET(KVM_MAGIC_MSR, kvm_vcpu_arch_shared, msr);
+	OFFSET(KVM_MAGIC_CRITICAL, kvm_vcpu_arch_shared, critical);
+	OFFSET(KVM_MAGIC_SR, kvm_vcpu_arch_shared, sr);
+#endif
+
+#ifdef CONFIG_44x
+	DEFINE(PGD_T_LOG2, PGD_T_LOG2);
+	DEFINE(PTE_T_LOG2, PTE_T_LOG2);
+#endif
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	DEFINE(TLBCAM_SIZE, sizeof(struct tlbcam));
+	OFFSET(TLBCAM_MAS0, tlbcam, MAS0);
+	OFFSET(TLBCAM_MAS1, tlbcam, MAS1);
+	OFFSET(TLBCAM_MAS2, tlbcam, MAS2);
+	OFFSET(TLBCAM_MAS3, tlbcam, MAS3);
+	OFFSET(TLBCAM_MAS7, tlbcam, MAS7);
+#endif
+
+#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
+	OFFSET(VCPU_EVR, kvm_vcpu, arch.evr[0]);
+	OFFSET(VCPU_ACC, kvm_vcpu, arch.acc);
+	OFFSET(VCPU_SPEFSCR, kvm_vcpu, arch.spefscr);
+	OFFSET(VCPU_HOST_SPEFSCR, kvm_vcpu, arch.host_spefscr);
+#endif
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	OFFSET(VCPU_HOST_MAS4, kvm_vcpu, arch.host_mas4);
+	OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6);
+#endif
+
+#ifdef CONFIG_KVM_XICS
+	DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu,
+					       arch.xive_saved_state));
+	DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
+					    arch.xive_cam_word));
+	DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
+	DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on));
+	DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr));
+	DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr));
+#endif
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu);
+	OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl);
+	OFFSET(VCPU_TIMING_LAST_ENTER_TBU, kvm_vcpu, arch.timing_last_enter.tv32.tbu);
+	OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl);
+#endif
+
+#ifdef CONFIG_PPC_POWERNV
+	OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr);
+	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
+	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
+	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
+	OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
+	OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
+#define STOP_SPR(x, f)	OFFSET(x, paca_struct, stop_sprs.f)
+	STOP_SPR(STOP_PID, pid);
+	STOP_SPR(STOP_LDBAR, ldbar);
+	STOP_SPR(STOP_FSCR, fscr);
+	STOP_SPR(STOP_HFSCR, hfscr);
+	STOP_SPR(STOP_MMCR1, mmcr1);
+	STOP_SPR(STOP_MMCR2, mmcr2);
+	STOP_SPR(STOP_MMCRA, mmcra);
+#endif
+
+	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+	DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);
+
+#ifdef CONFIG_PPC_8xx
+	DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE));
+#endif
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c
new file mode 100644
index 000000000..a2dddd7f3
--- /dev/null
+++ b/arch/powerpc/kernel/audit.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+static unsigned signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+int audit_classify_arch(int arch)
+{
+#ifdef CONFIG_PPC64
+	if (arch == AUDIT_ARCH_PPC)
+		return 1;
+#endif
+	return 0;
+}
+
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_PPC64
+	extern int ppc32_classify_syscall(unsigned);
+	if (abi == AUDIT_ARCH_PPC)
+		return ppc32_classify_syscall(syscall);
+#endif
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_socketcall:
+		return 4;
+	case __NR_execve:
+		return 5;
+	default:
+		return 0;
+	}
+}
+
+static int __init audit_classes_init(void)
+{
+#ifdef CONFIG_PPC64
+	extern __u32 ppc32_dir_class[];
+	extern __u32 ppc32_write_class[];
+	extern __u32 ppc32_read_class[];
+	extern __u32 ppc32_chattr_class[];
+	extern __u32 ppc32_signal_class[];
+	audit_register_class(AUDIT_CLASS_WRITE_32, ppc32_write_class);
+	audit_register_class(AUDIT_CLASS_READ_32, ppc32_read_class);
+	audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ppc32_dir_class);
+	audit_register_class(AUDIT_CLASS_CHATTR_32, ppc32_chattr_class);
+	audit_register_class(AUDIT_CLASS_SIGNAL_32, ppc32_signal_class);
+#endif
+	audit_register_class(AUDIT_CLASS_WRITE, write_class);
+	audit_register_class(AUDIT_CLASS_READ, read_class);
+	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+	audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
+	return 0;
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
new file mode 100644
index 000000000..a046504d8
--- /dev/null
+++ b/arch/powerpc/kernel/btext.c
@@ -0,0 +1,949 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Procedures for drawing on the screen early on in the boot process.
+ *
+ * Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/memblock.h>
+
+#include <asm/sections.h>
+#include <asm/prom.h>
+#include <asm/btext.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/udbg.h>
+
+#define NO_SCROLL
+
+#ifndef NO_SCROLL
+static void scrollscreen(void);
+#endif
+
+#define __force_data __attribute__((__section__(".data")))
+
+static int g_loc_X __force_data;
+static int g_loc_Y __force_data;
+static int g_max_loc_X __force_data;
+static int g_max_loc_Y __force_data;
+
+static int dispDeviceRowBytes __force_data;
+static int dispDeviceDepth  __force_data;
+static int dispDeviceRect[4] __force_data;
+static unsigned char *dispDeviceBase __force_data;
+static unsigned char *logicalDisplayBase __force_data;
+
+unsigned long disp_BAT[2] __initdata = {0, 0};
+
+#define cmapsz	(16*256)
+
+static unsigned char vga_font[cmapsz];
+
+int boot_text_mapped __force_data = 0;
+int force_printk_to_btext = 0;
+
+extern void rmci_on(void);
+extern void rmci_off(void);
+
+static inline void rmci_maybe_on(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+	if (!(mfmsr() & MSR_DR))
+		rmci_on();
+#endif
+}
+
+static inline void rmci_maybe_off(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+	if (!(mfmsr() & MSR_DR))
+		rmci_off();
+#endif
+}
+
+
+#ifdef CONFIG_PPC32
+/* Calc BAT values for mapping the display and store them
+ * in disp_BAT.  Those values are then used from head.S to map
+ * the display during identify_machine() and MMU_Init()
+ *
+ * The display is mapped to virtual address 0xD0000000, rather
+ * than 1:1, because some some CHRP machines put the frame buffer
+ * in the region starting at 0xC0000000 (PAGE_OFFSET).
+ * This mapping is temporary and will disappear as soon as the
+ * setup done by MMU_Init() is applied.
+ *
+ * For now, we align the BAT and then map 8Mb on 601 and 16Mb
+ * on other PPCs. This may cause trouble if the framebuffer
+ * is really badly aligned, but I didn't encounter this case
+ * yet.
+ */
+void __init btext_prepare_BAT(void)
+{
+	unsigned long vaddr = PAGE_OFFSET + 0x10000000;
+	unsigned long addr;
+	unsigned long lowbits;
+
+	addr = (unsigned long)dispDeviceBase;
+	if (!addr) {
+		boot_text_mapped = 0;
+		return;
+	}
+	if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
+		/* 603, 604, G3, G4, ... */
+		lowbits = addr & ~0xFF000000UL;
+		addr &= 0xFF000000UL;
+		disp_BAT[0] = vaddr | (BL_16M<<2) | 2;
+		disp_BAT[1] = addr | (_PAGE_NO_CACHE | _PAGE_GUARDED | BPP_RW);	
+	} else {
+		/* 601 */
+		lowbits = addr & ~0xFF800000UL;
+		addr &= 0xFF800000UL;
+		disp_BAT[0] = vaddr | (_PAGE_NO_CACHE | PP_RWXX) | 4;
+		disp_BAT[1] = addr | BL_8M | 0x40;
+	}
+	logicalDisplayBase = (void *) (vaddr + lowbits);
+}
+#endif
+
+
+/* This function can be used to enable the early boot text when doing
+ * OF booting or within bootx init. It must be followed by a btext_unmap()
+ * call before the logical address becomes unusable
+ */
+void __init btext_setup_display(int width, int height, int depth, int pitch,
+				unsigned long address)
+{
+	g_loc_X = 0;
+	g_loc_Y = 0;
+	g_max_loc_X = width / 8;
+	g_max_loc_Y = height / 16;
+	logicalDisplayBase = (unsigned char *)address;
+	dispDeviceBase = (unsigned char *)address;
+	dispDeviceRowBytes = pitch;
+	dispDeviceDepth = depth == 15 ? 16 : depth;
+	dispDeviceRect[0] = dispDeviceRect[1] = 0;
+	dispDeviceRect[2] = width;
+	dispDeviceRect[3] = height;
+	boot_text_mapped = 1;
+}
+
+void __init btext_unmap(void)
+{
+	boot_text_mapped = 0;
+}
+
+/* Here's a small text engine to use during early boot
+ * or for debugging purposes
+ *
+ * todo:
+ *
+ *  - build some kind of vgacon with it to enable early printk
+ *  - move to a separate file
+ *  - add a few video driver hooks to keep in sync with display
+ *    changes.
+ */
+
+void btext_map(void)
+{
+	unsigned long base, offset, size;
+	unsigned char *vbase;
+
+	/* By default, we are no longer mapped */
+	boot_text_mapped = 0;
+	if (!dispDeviceBase)
+		return;
+	base = ((unsigned long) dispDeviceBase) & 0xFFFFF000UL;
+	offset = ((unsigned long) dispDeviceBase) - base;
+	size = dispDeviceRowBytes * dispDeviceRect[3] + offset
+		+ dispDeviceRect[0];
+	vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
+	if (!vbase)
+		return;
+	logicalDisplayBase = vbase + offset;
+	boot_text_mapped = 1;
+}
+
+static int btext_initialize(struct device_node *np)
+{
+	unsigned int width, height, depth, pitch;
+	unsigned long address = 0;
+	const u32 *prop;
+
+	prop = of_get_property(np, "linux,bootx-width", NULL);
+	if (prop == NULL)
+		prop = of_get_property(np, "width", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	width = *prop;
+	prop = of_get_property(np, "linux,bootx-height", NULL);
+	if (prop == NULL)
+		prop = of_get_property(np, "height", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	height = *prop;
+	prop = of_get_property(np, "linux,bootx-depth", NULL);
+	if (prop == NULL)
+		prop = of_get_property(np, "depth", NULL);
+	if (prop == NULL)
+		return -EINVAL;
+	depth = *prop;
+	pitch = width * ((depth + 7) / 8);
+	prop = of_get_property(np, "linux,bootx-linebytes", NULL);
+	if (prop == NULL)
+		prop = of_get_property(np, "linebytes", NULL);
+	if (prop && *prop != 0xffffffffu)
+		pitch = *prop;
+	if (pitch == 1)
+		pitch = 0x1000;
+	prop = of_get_property(np, "linux,bootx-addr", NULL);
+	if (prop == NULL)
+		prop = of_get_property(np, "address", NULL);
+	if (prop)
+		address = *prop;
+
+	/* FIXME: Add support for PCI reg properties. Right now, only
+	 * reliable on macs
+	 */
+	if (address == 0)
+		return -EINVAL;
+
+	g_loc_X = 0;
+	g_loc_Y = 0;
+	g_max_loc_X = width / 8;
+	g_max_loc_Y = height / 16;
+	dispDeviceBase = (unsigned char *)address;
+	dispDeviceRowBytes = pitch;
+	dispDeviceDepth = depth == 15 ? 16 : depth;
+	dispDeviceRect[0] = dispDeviceRect[1] = 0;
+	dispDeviceRect[2] = width;
+	dispDeviceRect[3] = height;
+
+	btext_map();
+
+	return 0;
+}
+
+int __init btext_find_display(int allow_nonstdout)
+{
+	const char *name;
+	struct device_node *np = NULL; 
+	int rc = -ENODEV;
+
+	name = of_get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name != NULL) {
+		np = of_find_node_by_path(name);
+		if (np != NULL) {
+			if (strcmp(np->type, "display") != 0) {
+				printk("boot stdout isn't a display !\n");
+				of_node_put(np);
+				np = NULL;
+			}
+		}
+	}
+	if (np)
+		rc = btext_initialize(np);
+	if (rc == 0 || !allow_nonstdout)
+		return rc;
+
+	for_each_node_by_type(np, "display") {
+		if (of_get_property(np, "linux,opened", NULL)) {
+			printk("trying %pOF ...\n", np);
+			rc = btext_initialize(np);
+			printk("result: %d\n", rc);
+		}
+		if (rc == 0) {
+			of_node_put(np);
+			break;
+		}
+	}
+	return rc;
+}
+
+/* Calc the base address of a given point (x,y) */
+static unsigned char * calc_base(int x, int y)
+{
+	unsigned char *base;
+
+	base = logicalDisplayBase;
+	if (!base)
+		base = dispDeviceBase;
+	base += (x + dispDeviceRect[0]) * (dispDeviceDepth >> 3);
+	base += (y + dispDeviceRect[1]) * dispDeviceRowBytes;
+	return base;
+}
+
+/* Adjust the display to a new resolution */
+void btext_update_display(unsigned long phys, int width, int height,
+			  int depth, int pitch)
+{
+	if (!dispDeviceBase)
+		return;
+
+	/* check it's the same frame buffer (within 256MB) */
+	if ((phys ^ (unsigned long)dispDeviceBase) & 0xf0000000)
+		return;
+
+	dispDeviceBase = (__u8 *) phys;
+	dispDeviceRect[0] = 0;
+	dispDeviceRect[1] = 0;
+	dispDeviceRect[2] = width;
+	dispDeviceRect[3] = height;
+	dispDeviceDepth = depth;
+	dispDeviceRowBytes = pitch;
+	if (boot_text_mapped) {
+		iounmap(logicalDisplayBase);
+		boot_text_mapped = 0;
+	}
+	btext_map();
+	g_loc_X = 0;
+	g_loc_Y = 0;
+	g_max_loc_X = width / 8;
+	g_max_loc_Y = height / 16;
+}
+EXPORT_SYMBOL(btext_update_display);
+
+void btext_clearscreen(void)
+{
+	unsigned int *base	= (unsigned int *)calc_base(0, 0);
+	unsigned long width 	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+					(dispDeviceDepth >> 3)) >> 2;
+	int i,j;
+
+	rmci_maybe_on();
+	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++)
+	{
+		unsigned int *ptr = base;
+		for(j=width; j; --j)
+			*(ptr++) = 0;
+		base += (dispDeviceRowBytes >> 2);
+	}
+	rmci_maybe_off();
+}
+
+void btext_flushscreen(void)
+{
+	unsigned int *base	= (unsigned int *)calc_base(0, 0);
+	unsigned long width 	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+					(dispDeviceDepth >> 3)) >> 2;
+	int i,j;
+
+	for (i=0; i < (dispDeviceRect[3] - dispDeviceRect[1]); i++)
+	{
+		unsigned int *ptr = base;
+		for(j = width; j > 0; j -= 8) {
+			__asm__ __volatile__ ("dcbst 0,%0" :: "r" (ptr));
+			ptr += 8;
+		}
+		base += (dispDeviceRowBytes >> 2);
+	}
+	__asm__ __volatile__ ("sync" ::: "memory");
+}
+
+void btext_flushline(void)
+{
+	unsigned int *base	= (unsigned int *)calc_base(0, g_loc_Y << 4);
+	unsigned long width 	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+					(dispDeviceDepth >> 3)) >> 2;
+	int i,j;
+
+	for (i=0; i < 16; i++)
+	{
+		unsigned int *ptr = base;
+		for(j = width; j > 0; j -= 8) {
+			__asm__ __volatile__ ("dcbst 0,%0" :: "r" (ptr));
+			ptr += 8;
+		}
+		base += (dispDeviceRowBytes >> 2);
+	}
+	__asm__ __volatile__ ("sync" ::: "memory");
+}
+
+
+#ifndef NO_SCROLL
+static void scrollscreen(void)
+{
+	unsigned int *src     	= (unsigned int *)calc_base(0,16);
+	unsigned int *dst     	= (unsigned int *)calc_base(0,0);
+	unsigned long width    	= ((dispDeviceRect[2] - dispDeviceRect[0]) *
+				   (dispDeviceDepth >> 3)) >> 2;
+	int i,j;
+
+	rmci_maybe_on();
+
+	for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++)
+	{
+		unsigned int *src_ptr = src;
+		unsigned int *dst_ptr = dst;
+		for(j=width; j; --j)
+			*(dst_ptr++) = *(src_ptr++);
+		src += (dispDeviceRowBytes >> 2);
+		dst += (dispDeviceRowBytes >> 2);
+	}
+	for (i=0; i<16; i++)
+	{
+		unsigned int *dst_ptr = dst;
+		for(j=width; j; --j)
+			*(dst_ptr++) = 0;
+		dst += (dispDeviceRowBytes >> 2);
+	}
+
+	rmci_maybe_off();
+}
+#endif /* ndef NO_SCROLL */
+
+static unsigned int expand_bits_8[16] = {
+	0x00000000,
+	0x000000ff,
+	0x0000ff00,
+	0x0000ffff,
+	0x00ff0000,
+	0x00ff00ff,
+	0x00ffff00,
+	0x00ffffff,
+	0xff000000,
+	0xff0000ff,
+	0xff00ff00,
+	0xff00ffff,
+	0xffff0000,
+	0xffff00ff,
+	0xffffff00,
+	0xffffffff
+};
+
+static unsigned int expand_bits_16[4] = {
+	0x00000000,
+	0x0000ffff,
+	0xffff0000,
+	0xffffffff
+};
+
+
+static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (-(bits >> 7) & fg) ^ bg;
+		base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
+		base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
+		base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
+		base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
+		base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
+		base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
+		base[7] = (-(bits & 1) & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0xFFFFFFFFUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_16;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 6] & fg) ^ bg;
+		base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
+		base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
+		base[3] = (eb[bits & 3] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+{
+	int l, bits;
+	int fg = 0x0F0F0F0FUL;
+	int bg = 0x00000000UL;
+	unsigned int *eb = (int *)expand_bits_8;
+
+	for (l = 0; l < 16; ++l)
+	{
+		bits = *font++;
+		base[0] = (eb[bits >> 4] & fg) ^ bg;
+		base[1] = (eb[bits & 0xf] & fg) ^ bg;
+		base = (unsigned int *) ((char *)base + rb);
+	}
+}
+
+static noinline void draw_byte(unsigned char c, long locX, long locY)
+{
+	unsigned char *base	= calc_base(locX << 3, locY << 4);
+	unsigned char *font	= &vga_font[((unsigned int)c) * 16];
+	int rb			= dispDeviceRowBytes;
+
+	rmci_maybe_on();
+	switch(dispDeviceDepth) {
+	case 24:
+	case 32:
+		draw_byte_32(font, (unsigned int *)base, rb);
+		break;
+	case 15:
+	case 16:
+		draw_byte_16(font, (unsigned int *)base, rb);
+		break;
+	case 8:
+		draw_byte_8(font, (unsigned int *)base, rb);
+		break;
+	}
+	rmci_maybe_off();
+}
+
+void btext_drawchar(char c)
+{
+	int cline = 0;
+#ifdef NO_SCROLL
+	int x;
+#endif
+	if (!boot_text_mapped)
+		return;
+
+	switch (c) {
+	case '\b':
+		if (g_loc_X > 0)
+			--g_loc_X;
+		break;
+	case '\t':
+		g_loc_X = (g_loc_X & -8) + 8;
+		break;
+	case '\r':
+		g_loc_X = 0;
+		break;
+	case '\n':
+		g_loc_X = 0;
+		g_loc_Y++;
+		cline = 1;
+		break;
+	default:
+		draw_byte(c, g_loc_X++, g_loc_Y);
+	}
+	if (g_loc_X >= g_max_loc_X) {
+		g_loc_X = 0;
+		g_loc_Y++;
+		cline = 1;
+	}
+#ifndef NO_SCROLL
+	while (g_loc_Y >= g_max_loc_Y) {
+		scrollscreen();
+		g_loc_Y--;
+	}
+#else
+	/* wrap around from bottom to top of screen so we don't
+	   waste time scrolling each line.  -- paulus. */
+	if (g_loc_Y >= g_max_loc_Y)
+		g_loc_Y = 0;
+	if (cline) {
+		for (x = 0; x < g_max_loc_X; ++x)
+			draw_byte(' ', x, g_loc_Y);
+	}
+#endif
+}
+
+void btext_drawstring(const char *c)
+{
+	if (!boot_text_mapped)
+		return;
+	while (*c)
+		btext_drawchar(*c++);
+}
+
+void btext_drawtext(const char *c, unsigned int len)
+{
+	if (!boot_text_mapped)
+		return;
+	while (len--)
+		btext_drawchar(*c++);
+}
+
+void btext_drawhex(unsigned long v)
+{
+	if (!boot_text_mapped)
+		return;
+#ifdef CONFIG_PPC64
+	btext_drawchar(hex_asc_hi(v >> 56));
+	btext_drawchar(hex_asc_lo(v >> 56));
+	btext_drawchar(hex_asc_hi(v >> 48));
+	btext_drawchar(hex_asc_lo(v >> 48));
+	btext_drawchar(hex_asc_hi(v >> 40));
+	btext_drawchar(hex_asc_lo(v >> 40));
+	btext_drawchar(hex_asc_hi(v >> 32));
+	btext_drawchar(hex_asc_lo(v >> 32));
+#endif
+	btext_drawchar(hex_asc_hi(v >> 24));
+	btext_drawchar(hex_asc_lo(v >> 24));
+	btext_drawchar(hex_asc_hi(v >> 16));
+	btext_drawchar(hex_asc_lo(v >> 16));
+	btext_drawchar(hex_asc_hi(v >> 8));
+	btext_drawchar(hex_asc_lo(v >> 8));
+	btext_drawchar(hex_asc_hi(v));
+	btext_drawchar(hex_asc_lo(v));
+	btext_drawchar(' ');
+}
+
+void __init udbg_init_btext(void)
+{
+	/* If btext is enabled, we might have a BAT setup for early display,
+	 * thus we do enable some very basic udbg output
+	 */
+	udbg_putc = btext_drawchar;
+}
+
+static unsigned char vga_font[cmapsz] = {
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd,
+0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xff,
+0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe,
+0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18,
+0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c,
+0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x42, 0x42, 0x66, 0x3c, 0x00,
+0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xc3, 0x99, 0xbd,
+0xbd, 0x99, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x1e, 0x0e,
+0x1a, 0x32, 0x78, 0xcc, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x3c, 0x66, 0x66, 0x66, 0x66, 0x3c, 0x18, 0x7e, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x33, 0x3f, 0x30, 0x30, 0x30,
+0x30, 0x70, 0xf0, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x63,
+0x7f, 0x63, 0x63, 0x63, 0x63, 0x67, 0xe7, 0xe6, 0xc0, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x18, 0x18, 0xdb, 0x3c, 0xe7, 0x3c, 0xdb, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfe, 0xf8,
+0xf0, 0xe0, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x0e,
+0x1e, 0x3e, 0xfe, 0x3e, 0x1e, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+0x66, 0x00, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xdb,
+0xdb, 0xdb, 0x7b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7c, 0xc6, 0x60, 0x38, 0x6c, 0xc6, 0xc6, 0x6c, 0x38, 0x0c, 0xc6,
+0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0xfe, 0xfe, 0xfe, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c,
+0x7e, 0x18, 0x18, 0x18, 0x7e, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x7e, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x18, 0x0c, 0xfe, 0x0c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x60, 0xfe, 0x60, 0x30, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0xc0,
+0xc0, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x24, 0x66, 0xff, 0x66, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x38, 0x7c, 0x7c, 0xfe, 0xfe, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0x7c, 0x7c,
+0x38, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x66, 0x66, 0x24, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6c,
+0x6c, 0xfe, 0x6c, 0x6c, 0x6c, 0xfe, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x18, 0x7c, 0xc6, 0xc2, 0xc0, 0x7c, 0x06, 0x06, 0x86, 0xc6, 0x7c,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2, 0xc6, 0x0c, 0x18,
+0x30, 0x60, 0xc6, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c,
+0x6c, 0x38, 0x76, 0xdc, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x30, 0x30, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x30, 0x30, 0x30,
+0x30, 0x30, 0x18, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x18,
+0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x3c, 0xff, 0x3c, 0x66, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x02, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x80, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xce, 0xde, 0xf6, 0xe6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x38, 0x78, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0x06, 0x06, 0x3c, 0x06, 0x06, 0x06, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x1c, 0x3c, 0x6c, 0xcc, 0xfe,
+0x0c, 0x0c, 0x0c, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0,
+0xc0, 0xc0, 0xfc, 0x06, 0x06, 0x06, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x60, 0xc0, 0xc0, 0xfc, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0x06, 0x06, 0x0c, 0x18,
+0x30, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0xc6, 0xc6, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x06, 0x06, 0x0c, 0x78,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00,
+0x00, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x18, 0x00, 0x00, 0x00, 0x18, 0x18, 0x30, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x06,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00,
+0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60,
+0x30, 0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x60, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0x0c, 0x18, 0x18, 0x18, 0x00, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xde, 0xde,
+0xde, 0xdc, 0xc0, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38,
+0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x66, 0x66, 0x66, 0x66, 0xfc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0xc2, 0xc0, 0xc0, 0xc0,
+0xc0, 0xc2, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x6c,
+0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x6c, 0xf8, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68, 0x60, 0x62, 0x66, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x66, 0x62, 0x68, 0x78, 0x68,
+0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66,
+0xc2, 0xc0, 0xc0, 0xde, 0xc6, 0xc6, 0x66, 0x3a, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x0c,
+0x0c, 0x0c, 0x0c, 0x0c, 0xcc, 0xcc, 0xcc, 0x78, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xe6, 0x66, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0x66, 0xe6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x60, 0x60, 0x60, 0x60, 0x60,
+0x60, 0x62, 0x66, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xe7,
+0xff, 0xff, 0xdb, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66,
+0x66, 0x66, 0x7c, 0x60, 0x60, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xd6, 0xde, 0x7c,
+0x0c, 0x0e, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66, 0x66, 0x7c, 0x6c,
+0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6,
+0xc6, 0x60, 0x38, 0x0c, 0x06, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xff, 0xdb, 0x99, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3,
+0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xc3, 0xc3, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x66,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0x66, 0x3c, 0x18, 0x18,
+0x3c, 0x66, 0xc3, 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3,
+0xc3, 0x66, 0x3c, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xff, 0xc3, 0x86, 0x0c, 0x18, 0x30, 0x60, 0xc1, 0xc3, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x30, 0x30, 0x30, 0x30, 0x30,
+0x30, 0x30, 0x30, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+0xc0, 0xe0, 0x70, 0x38, 0x1c, 0x0e, 0x06, 0x02, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x3c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+0x30, 0x30, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x78, 0x6c, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc0, 0xc0, 0xc0, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x0c, 0x3c, 0x6c, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xf0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0xcc, 0x78, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x6c, 0x76, 0x66, 0x66, 0x66, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x18, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x06, 0x00, 0x0e, 0x06, 0x06,
+0x06, 0x06, 0x06, 0x06, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0xe0, 0x60,
+0x60, 0x66, 0x6c, 0x78, 0x78, 0x6c, 0x66, 0xe6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xdb,
+0xdb, 0xdb, 0xdb, 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x66, 0x66,
+0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x76, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x7c, 0x0c, 0x0c, 0x1e, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x76, 0x66, 0x60, 0x60, 0x60, 0xf0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xc6, 0x60,
+0x38, 0x0c, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x30,
+0x30, 0xfc, 0x30, 0x30, 0x30, 0x30, 0x36, 0x1c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0xc3, 0xc3,
+0xc3, 0x66, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xc3, 0xc3, 0xc3, 0xdb, 0xdb, 0xff, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0x3c, 0x66, 0xc3,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0xf8, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xfe, 0xcc, 0x18, 0x30, 0x60, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x0e, 0x18, 0x18, 0x18, 0x70, 0x18, 0x18, 0x18, 0x18, 0x0e,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x00, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x18,
+0x18, 0x18, 0x0e, 0x18, 0x18, 0x18, 0x18, 0x70, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0xc6,
+0xc6, 0xc6, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x66,
+0xc2, 0xc0, 0xc0, 0xc0, 0xc2, 0x66, 0x3c, 0x0c, 0x06, 0x7c, 0x00, 0x00,
+0x00, 0x00, 0xcc, 0x00, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x00, 0x7c, 0xc6, 0xfe,
+0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c,
+0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xcc, 0x00, 0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38,
+0x00, 0x78, 0x0c, 0x7c, 0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x3c, 0x66, 0x60, 0x60, 0x66, 0x3c, 0x0c, 0x06,
+0x3c, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xfe,
+0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00,
+0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x60, 0x30, 0x18, 0x00, 0x7c, 0xc6, 0xfe, 0xc0, 0xc0, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x38, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c, 0x66,
+0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x60, 0x30, 0x18, 0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x10, 0x38, 0x6c, 0xc6, 0xc6,
+0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x38, 0x00,
+0x38, 0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x30, 0x60, 0x00, 0xfe, 0x66, 0x60, 0x7c, 0x60, 0x60, 0x66, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0x3b, 0x1b,
+0x7e, 0xd8, 0xdc, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3e, 0x6c,
+0xcc, 0xcc, 0xfe, 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x10, 0x38, 0x6c, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x00, 0x7c, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18,
+0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x30, 0x78, 0xcc, 0x00, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x76,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x30, 0x18, 0x00, 0xcc, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00,
+0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7e, 0x06, 0x0c, 0x78, 0x00,
+0x00, 0xc6, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6,
+0xc6, 0xc6, 0xc6, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e,
+0xc3, 0xc0, 0xc0, 0xc0, 0xc3, 0x7e, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x38, 0x6c, 0x64, 0x60, 0xf0, 0x60, 0x60, 0x60, 0x60, 0xe6, 0xfc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc3, 0x66, 0x3c, 0x18, 0xff, 0x18,
+0xff, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x66, 0x66,
+0x7c, 0x62, 0x66, 0x6f, 0x66, 0x66, 0x66, 0xf3, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x0e, 0x1b, 0x18, 0x18, 0x18, 0x7e, 0x18, 0x18, 0x18, 0x18, 0x18,
+0xd8, 0x70, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0x78, 0x0c, 0x7c,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x18, 0x30,
+0x00, 0x38, 0x18, 0x18, 0x18, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x18, 0x30, 0x60, 0x00, 0x7c, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x30, 0x60, 0x00, 0xcc, 0xcc, 0xcc,
+0xcc, 0xcc, 0xcc, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc,
+0x00, 0xdc, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x00, 0x00, 0x00, 0x00,
+0x76, 0xdc, 0x00, 0xc6, 0xe6, 0xf6, 0xfe, 0xde, 0xce, 0xc6, 0xc6, 0xc6,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3c, 0x6c, 0x6c, 0x3e, 0x00, 0x7e, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c,
+0x38, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x30, 0x30, 0x00, 0x30, 0x30, 0x60, 0xc0, 0xc6, 0xc6, 0x7c,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc0,
+0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0xfe, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30, 0x60, 0xce, 0x9b, 0x06,
+0x0c, 0x1f, 0x00, 0x00, 0x00, 0xc0, 0xc0, 0xc2, 0xc6, 0xcc, 0x18, 0x30,
+0x66, 0xce, 0x96, 0x3e, 0x06, 0x06, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18,
+0x00, 0x18, 0x18, 0x18, 0x3c, 0x3c, 0x3c, 0x18, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x6c, 0xd8, 0x6c, 0x36, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd8, 0x6c, 0x36,
+0x6c, 0xd8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x44, 0x11, 0x44,
+0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44, 0x11, 0x44,
+0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa, 0x55, 0xaa,
+0x55, 0xaa, 0x55, 0xaa, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77,
+0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0xdd, 0x77, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x18, 0xf8,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf6, 0x06, 0xf6, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x06, 0xf6,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf6, 0x06, 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xfe, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8, 0x18, 0xf8,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xf8, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x37,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x37, 0x30, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xf7, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xff, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x37, 0x30, 0x37, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36,
+0x36, 0xf7, 0x00, 0xf7, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0xff,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xff, 0x00, 0xff, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x3f,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x1f, 0x18, 0x1f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x18, 0x1f, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f,
+0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x36, 0x36, 0x36, 0xff, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36, 0x36,
+0x18, 0x18, 0x18, 0x18, 0x18, 0xff, 0x18, 0xff, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0xf8,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x1f, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
+0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xf0, 0xf0, 0xf0,
+0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+0x0f, 0x0f, 0x0f, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x76, 0xdc, 0xd8, 0xd8, 0xd8, 0xdc, 0x76, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x78, 0xcc, 0xcc, 0xcc, 0xd8, 0xcc, 0xc6, 0xc6, 0xc6, 0xcc,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xc6, 0xc6, 0xc0, 0xc0, 0xc0,
+0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0xfe, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0xfe, 0xc6, 0x60, 0x30, 0x18, 0x30, 0x60, 0xc6, 0xfe,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xd8, 0xd8,
+0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x66, 0x66, 0x66, 0x66, 0x66, 0x7c, 0x60, 0x60, 0xc0, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x18, 0x3c, 0x66, 0x66,
+0x66, 0x3c, 0x18, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38,
+0x6c, 0xc6, 0xc6, 0xfe, 0xc6, 0xc6, 0x6c, 0x38, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x38, 0x6c, 0xc6, 0xc6, 0xc6, 0x6c, 0x6c, 0x6c, 0x6c, 0xee,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x30, 0x18, 0x0c, 0x3e, 0x66,
+0x66, 0x66, 0x66, 0x3c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x7e, 0xdb, 0xdb, 0xdb, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x03, 0x06, 0x7e, 0xdb, 0xdb, 0xf3, 0x7e, 0x60, 0xc0,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x30, 0x60, 0x60, 0x7c, 0x60,
+0x60, 0x60, 0x30, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7c,
+0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0xc6, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00, 0x00, 0xfe, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x7e, 0x18,
+0x18, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30,
+0x18, 0x0c, 0x06, 0x0c, 0x18, 0x30, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x0c, 0x18, 0x30, 0x60, 0x30, 0x18, 0x0c, 0x00, 0x7e,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x1b, 0x1b, 0x1b, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0xd8, 0xd8, 0xd8, 0x70, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x7e, 0x00, 0x18, 0x18, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0xdc, 0x00,
+0x76, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x6c, 0x6c,
+0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x18, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x0c, 0x0c,
+0x0c, 0x0c, 0x0c, 0xec, 0x6c, 0x6c, 0x3c, 0x1c, 0x00, 0x00, 0x00, 0x00,
+0x00, 0xd8, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0xd8, 0x30, 0x60, 0xc8, 0xf8, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+0x00, 0x00, 0x00, 0x00,
+};
+
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
new file mode 100644
index 000000000..9edb45430
--- /dev/null
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -0,0 +1,889 @@
+/*
+ * Processor cache information made available to userspace via sysfs;
+ * intended to be compatible with x86 intel_cacheinfo implementation.
+ *
+ * Copyright 2008 IBM Corporation
+ * Author: Nathan Lynch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/prom.h>
+
+#include "cacheinfo.h"
+
+/* per-cpu object for tracking:
+ * - a "cache" kobject for the top-level directory
+ * - a list of "index" objects representing the cpu's local cache hierarchy
+ */
+struct cache_dir {
+	struct kobject *kobj; /* bare (not embedded) kobject for cache
+			       * directory */
+	struct cache_index_dir *index; /* list of index objects */
+};
+
+/* "index" object: each cpu's cache directory has an index
+ * subdirectory corresponding to a cache object associated with the
+ * cpu.  This object's lifetime is managed via the embedded kobject.
+ */
+struct cache_index_dir {
+	struct kobject kobj;
+	struct cache_index_dir *next; /* next index in parent directory */
+	struct cache *cache;
+};
+
+/* Template for determining which OF properties to query for a given
+ * cache type */
+struct cache_type_info {
+	const char *name;
+	const char *size_prop;
+
+	/* Allow for both [di]-cache-line-size and
+	 * [di]-cache-block-size properties.  According to the PowerPC
+	 * Processor binding, -line-size should be provided if it
+	 * differs from the cache block size (that which is operated
+	 * on by cache instructions), so we look for -line-size first.
+	 * See cache_get_line_size(). */
+
+	const char *line_size_props[2];
+	const char *nr_sets_prop;
+};
+
+/* These are used to index the cache_type_info array. */
+#define CACHE_TYPE_UNIFIED     0 /* cache-size, cache-block-size, etc. */
+#define CACHE_TYPE_UNIFIED_D   1 /* d-cache-size, d-cache-block-size, etc */
+#define CACHE_TYPE_INSTRUCTION 2
+#define CACHE_TYPE_DATA        3
+
+static const struct cache_type_info cache_type_info[] = {
+	{
+		/* Embedded systems that use cache-size, cache-block-size,
+		 * etc. for the Unified (typically L2) cache. */
+		.name            = "Unified",
+		.size_prop       = "cache-size",
+		.line_size_props = { "cache-line-size",
+				     "cache-block-size", },
+		.nr_sets_prop    = "cache-sets",
+	},
+	{
+		/* PowerPC Processor binding says the [di]-cache-*
+		 * must be equal on unified caches, so just use
+		 * d-cache properties. */
+		.name            = "Unified",
+		.size_prop       = "d-cache-size",
+		.line_size_props = { "d-cache-line-size",
+				     "d-cache-block-size", },
+		.nr_sets_prop    = "d-cache-sets",
+	},
+	{
+		.name            = "Instruction",
+		.size_prop       = "i-cache-size",
+		.line_size_props = { "i-cache-line-size",
+				     "i-cache-block-size", },
+		.nr_sets_prop    = "i-cache-sets",
+	},
+	{
+		.name            = "Data",
+		.size_prop       = "d-cache-size",
+		.line_size_props = { "d-cache-line-size",
+				     "d-cache-block-size", },
+		.nr_sets_prop    = "d-cache-sets",
+	},
+};
+
+/* Cache object: each instance of this corresponds to a distinct cache
+ * in the system.  There are separate objects for Harvard caches: one
+ * each for instruction and data, and each refers to the same OF node.
+ * The refcount of the OF node is elevated for the lifetime of the
+ * cache object.  A cache object is released when its shared_cpu_map
+ * is cleared (see cache_cpu_clear).
+ *
+ * A cache object is on two lists: an unsorted global list
+ * (cache_list) of cache objects; and a singly-linked list
+ * representing the local cache hierarchy, which is ordered by level
+ * (e.g. L1d -> L1i -> L2 -> L3).
+ */
+struct cache {
+	struct device_node *ofnode;    /* OF node for this cache, may be cpu */
+	struct cpumask shared_cpu_map; /* online CPUs using this cache */
+	int type;                      /* split cache disambiguation */
+	int level;                     /* level not explicit in device tree */
+	struct list_head list;         /* global list of cache objects */
+	struct cache *next_local;      /* next cache of >= level */
+};
+
+static DEFINE_PER_CPU(struct cache_dir *, cache_dir_pcpu);
+
+/* traversal/modification of this list occurs only at cpu hotplug time;
+ * access is serialized by cpu hotplug locking
+ */
+static LIST_HEAD(cache_list);
+
+static struct cache_index_dir *kobj_to_cache_index_dir(struct kobject *k)
+{
+	return container_of(k, struct cache_index_dir, kobj);
+}
+
+static const char *cache_type_string(const struct cache *cache)
+{
+	return cache_type_info[cache->type].name;
+}
+
+static void cache_init(struct cache *cache, int type, int level,
+		       struct device_node *ofnode)
+{
+	cache->type = type;
+	cache->level = level;
+	cache->ofnode = of_node_get(ofnode);
+	INIT_LIST_HEAD(&cache->list);
+	list_add(&cache->list, &cache_list);
+}
+
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
+{
+	struct cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+	if (cache)
+		cache_init(cache, type, level, ofnode);
+
+	return cache;
+}
+
+static void release_cache_debugcheck(struct cache *cache)
+{
+	struct cache *iter;
+
+	list_for_each_entry(iter, &cache_list, list)
+		WARN_ONCE(iter->next_local == cache,
+			  "cache for %pOF(%s) refers to cache for %pOF(%s)\n",
+			  iter->ofnode,
+			  cache_type_string(iter),
+			  cache->ofnode,
+			  cache_type_string(cache));
+}
+
+static void release_cache(struct cache *cache)
+{
+	if (!cache)
+		return;
+
+	pr_debug("freeing L%d %s cache for %pOF\n", cache->level,
+		 cache_type_string(cache), cache->ofnode);
+
+	release_cache_debugcheck(cache);
+	list_del(&cache->list);
+	of_node_put(cache->ofnode);
+	kfree(cache);
+}
+
+static void cache_cpu_set(struct cache *cache, int cpu)
+{
+	struct cache *next = cache;
+
+	while (next) {
+		WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map),
+			  "CPU %i already accounted in %pOF(%s)\n",
+			  cpu, next->ofnode,
+			  cache_type_string(next));
+		cpumask_set_cpu(cpu, &next->shared_cpu_map);
+		next = next->next_local;
+	}
+}
+
+static int cache_size(const struct cache *cache, unsigned int *ret)
+{
+	const char *propname;
+	const __be32 *cache_size;
+
+	propname = cache_type_info[cache->type].size_prop;
+
+	cache_size = of_get_property(cache->ofnode, propname, NULL);
+	if (!cache_size)
+		return -ENODEV;
+
+	*ret = of_read_number(cache_size, 1);
+	return 0;
+}
+
+static int cache_size_kb(const struct cache *cache, unsigned int *ret)
+{
+	unsigned int size;
+
+	if (cache_size(cache, &size))
+		return -ENODEV;
+
+	*ret = size / 1024;
+	return 0;
+}
+
+/* not cache_line_size() because that's a macro in include/linux/cache.h */
+static int cache_get_line_size(const struct cache *cache, unsigned int *ret)
+{
+	const __be32 *line_size;
+	int i, lim;
+
+	lim = ARRAY_SIZE(cache_type_info[cache->type].line_size_props);
+
+	for (i = 0; i < lim; i++) {
+		const char *propname;
+
+		propname = cache_type_info[cache->type].line_size_props[i];
+		line_size = of_get_property(cache->ofnode, propname, NULL);
+		if (line_size)
+			break;
+	}
+
+	if (!line_size)
+		return -ENODEV;
+
+	*ret = of_read_number(line_size, 1);
+	return 0;
+}
+
+static int cache_nr_sets(const struct cache *cache, unsigned int *ret)
+{
+	const char *propname;
+	const __be32 *nr_sets;
+
+	propname = cache_type_info[cache->type].nr_sets_prop;
+
+	nr_sets = of_get_property(cache->ofnode, propname, NULL);
+	if (!nr_sets)
+		return -ENODEV;
+
+	*ret = of_read_number(nr_sets, 1);
+	return 0;
+}
+
+static int cache_associativity(const struct cache *cache, unsigned int *ret)
+{
+	unsigned int line_size;
+	unsigned int nr_sets;
+	unsigned int size;
+
+	if (cache_nr_sets(cache, &nr_sets))
+		goto err;
+
+	/* If the cache is fully associative, there is no need to
+	 * check the other properties.
+	 */
+	if (nr_sets == 1) {
+		*ret = 0;
+		return 0;
+	}
+
+	if (cache_get_line_size(cache, &line_size))
+		goto err;
+	if (cache_size(cache, &size))
+		goto err;
+
+	if (!(nr_sets > 0 && size > 0 && line_size > 0))
+		goto err;
+
+	*ret = (size / nr_sets) / line_size;
+	return 0;
+err:
+	return -ENODEV;
+}
+
+/* helper for dealing with split caches */
+static struct cache *cache_find_first_sibling(struct cache *cache)
+{
+	struct cache *iter;
+
+	if (cache->type == CACHE_TYPE_UNIFIED ||
+	    cache->type == CACHE_TYPE_UNIFIED_D)
+		return cache;
+
+	list_for_each_entry(iter, &cache_list, list)
+		if (iter->ofnode == cache->ofnode && iter->next_local == cache)
+			return iter;
+
+	return cache;
+}
+
+/* return the first cache on a local list matching node */
+static struct cache *cache_lookup_by_node(const struct device_node *node)
+{
+	struct cache *cache = NULL;
+	struct cache *iter;
+
+	list_for_each_entry(iter, &cache_list, list) {
+		if (iter->ofnode != node)
+			continue;
+		cache = cache_find_first_sibling(iter);
+		break;
+	}
+
+	return cache;
+}
+
+static bool cache_node_is_unified(const struct device_node *np)
+{
+	return of_get_property(np, "cache-unified", NULL);
+}
+
+/*
+ * Unified caches can have two different sets of tags.  Most embedded
+ * use cache-size, etc. for the unified cache size, but open firmware systems
+ * use d-cache-size, etc.   Check on initialization for which type we have, and
+ * return the appropriate structure type.  Assume it's embedded if it isn't
+ * open firmware.  If it's yet a 3rd type, then there will be missing entries
+ * in /sys/devices/system/cpu/cpu0/cache/index2/, and this code will need
+ * to be extended further.
+ */
+static int cache_is_unified_d(const struct device_node *np)
+{
+	return of_get_property(np,
+		cache_type_info[CACHE_TYPE_UNIFIED_D].size_prop, NULL) ?
+		CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED;
+}
+
+/*
+ */
+static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level)
+{
+	pr_debug("creating L%d ucache for %pOF\n", level, node);
+
+	return new_cache(cache_is_unified_d(node), level, node);
+}
+
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+						int level)
+{
+	struct cache *dcache, *icache;
+
+	pr_debug("creating L%d dcache and icache for %pOF\n", level,
+		 node);
+
+	dcache = new_cache(CACHE_TYPE_DATA, level, node);
+	icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node);
+
+	if (!dcache || !icache)
+		goto err;
+
+	dcache->next_local = icache;
+
+	return dcache;
+err:
+	release_cache(dcache);
+	release_cache(icache);
+	return NULL;
+}
+
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
+{
+	struct cache *cache;
+
+	if (cache_node_is_unified(node))
+		cache = cache_do_one_devnode_unified(node, level);
+	else
+		cache = cache_do_one_devnode_split(node, level);
+
+	return cache;
+}
+
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+						 int level)
+{
+	struct cache *cache;
+
+	cache = cache_lookup_by_node(node);
+
+	WARN_ONCE(cache && cache->level != level,
+		  "cache level mismatch on lookup (got %d, expected %d)\n",
+		  cache->level, level);
+
+	if (!cache)
+		cache = cache_do_one_devnode(node, level);
+
+	return cache;
+}
+
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
+{
+	while (smaller->next_local) {
+		if (smaller->next_local == bigger)
+			return; /* already linked */
+		smaller = smaller->next_local;
+	}
+
+	smaller->next_local = bigger;
+}
+
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
+{
+	WARN_ON_ONCE(cache->level != 1);
+	WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
+}
+
+static void do_subsidiary_caches(struct cache *cache)
+{
+	struct device_node *subcache_node;
+	int level = cache->level;
+
+	do_subsidiary_caches_debugcheck(cache);
+
+	while ((subcache_node = of_find_next_cache_node(cache->ofnode))) {
+		struct cache *subcache;
+
+		level++;
+		subcache = cache_lookup_or_instantiate(subcache_node, level);
+		of_node_put(subcache_node);
+		if (!subcache)
+			break;
+
+		link_cache_lists(cache, subcache);
+		cache = subcache;
+	}
+}
+
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
+{
+	struct device_node *cpu_node;
+	struct cache *cpu_cache = NULL;
+
+	pr_debug("creating cache object(s) for CPU %i\n", cpu_id);
+
+	cpu_node = of_get_cpu_node(cpu_id, NULL);
+	WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id);
+	if (!cpu_node)
+		goto out;
+
+	cpu_cache = cache_lookup_or_instantiate(cpu_node, 1);
+	if (!cpu_cache)
+		goto out;
+
+	do_subsidiary_caches(cpu_cache);
+
+	cache_cpu_set(cpu_cache, cpu_id);
+out:
+	of_node_put(cpu_node);
+
+	return cpu_cache;
+}
+
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
+{
+	struct cache_dir *cache_dir;
+	struct device *dev;
+	struct kobject *kobj = NULL;
+
+	dev = get_cpu_device(cpu_id);
+	WARN_ONCE(!dev, "no dev for CPU %i\n", cpu_id);
+	if (!dev)
+		goto err;
+
+	kobj = kobject_create_and_add("cache", &dev->kobj);
+	if (!kobj)
+		goto err;
+
+	cache_dir = kzalloc(sizeof(*cache_dir), GFP_KERNEL);
+	if (!cache_dir)
+		goto err;
+
+	cache_dir->kobj = kobj;
+
+	WARN_ON_ONCE(per_cpu(cache_dir_pcpu, cpu_id) != NULL);
+
+	per_cpu(cache_dir_pcpu, cpu_id) = cache_dir;
+
+	return cache_dir;
+err:
+	kobject_put(kobj);
+	return NULL;
+}
+
+static void cache_index_release(struct kobject *kobj)
+{
+	struct cache_index_dir *index;
+
+	index = kobj_to_cache_index_dir(kobj);
+
+	pr_debug("freeing index directory for L%d %s cache\n",
+		 index->cache->level, cache_type_string(index->cache));
+
+	kfree(index);
+}
+
+static ssize_t cache_index_show(struct kobject *k, struct attribute *attr, char *buf)
+{
+	struct kobj_attribute *kobj_attr;
+
+	kobj_attr = container_of(attr, struct kobj_attribute, attr);
+
+	return kobj_attr->show(k, kobj_attr, buf);
+}
+
+static struct cache *index_kobj_to_cache(struct kobject *k)
+{
+	struct cache_index_dir *index;
+
+	index = kobj_to_cache_index_dir(k);
+
+	return index->cache;
+}
+
+static ssize_t size_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int size_kb;
+	struct cache *cache;
+
+	cache = index_kobj_to_cache(k);
+
+	if (cache_size_kb(cache, &size_kb))
+		return -ENODEV;
+
+	return sprintf(buf, "%uK\n", size_kb);
+}
+
+static struct kobj_attribute cache_size_attr =
+	__ATTR(size, 0444, size_show, NULL);
+
+
+static ssize_t line_size_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int line_size;
+	struct cache *cache;
+
+	cache = index_kobj_to_cache(k);
+
+	if (cache_get_line_size(cache, &line_size))
+		return -ENODEV;
+
+	return sprintf(buf, "%u\n", line_size);
+}
+
+static struct kobj_attribute cache_line_size_attr =
+	__ATTR(coherency_line_size, 0444, line_size_show, NULL);
+
+static ssize_t nr_sets_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int nr_sets;
+	struct cache *cache;
+
+	cache = index_kobj_to_cache(k);
+
+	if (cache_nr_sets(cache, &nr_sets))
+		return -ENODEV;
+
+	return sprintf(buf, "%u\n", nr_sets);
+}
+
+static struct kobj_attribute cache_nr_sets_attr =
+	__ATTR(number_of_sets, 0444, nr_sets_show, NULL);
+
+static ssize_t associativity_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int associativity;
+	struct cache *cache;
+
+	cache = index_kobj_to_cache(k);
+
+	if (cache_associativity(cache, &associativity))
+		return -ENODEV;
+
+	return sprintf(buf, "%u\n", associativity);
+}
+
+static struct kobj_attribute cache_assoc_attr =
+	__ATTR(ways_of_associativity, 0444, associativity_show, NULL);
+
+static ssize_t type_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	struct cache *cache;
+
+	cache = index_kobj_to_cache(k);
+
+	return sprintf(buf, "%s\n", cache_type_string(cache));
+}
+
+static struct kobj_attribute cache_type_attr =
+	__ATTR(type, 0444, type_show, NULL);
+
+static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	struct cache_index_dir *index;
+	struct cache *cache;
+
+	index = kobj_to_cache_index_dir(k);
+	cache = index->cache;
+
+	return sprintf(buf, "%d\n", cache->level);
+}
+
+static struct kobj_attribute cache_level_attr =
+	__ATTR(level, 0444, level_show, NULL);
+
+static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+	struct cache_index_dir *index;
+	struct cache *cache;
+	int ret;
+
+	index = kobj_to_cache_index_dir(k);
+	cache = index->cache;
+
+	ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n",
+			cpumask_pr_args(&cache->shared_cpu_map));
+	buf[ret++] = '\n';
+	buf[ret] = '\0';
+	return ret;
+}
+
+static struct kobj_attribute cache_shared_cpu_map_attr =
+	__ATTR(shared_cpu_map, 0444, shared_cpu_map_show, NULL);
+
+/* Attributes which should always be created -- the kobject/sysfs core
+ * does this automatically via kobj_type->default_attrs.  This is the
+ * minimum data required to uniquely identify a cache.
+ */
+static struct attribute *cache_index_default_attrs[] = {
+	&cache_type_attr.attr,
+	&cache_level_attr.attr,
+	&cache_shared_cpu_map_attr.attr,
+	NULL,
+};
+
+/* Attributes which should be created if the cache device node has the
+ * right properties -- see cacheinfo_create_index_opt_attrs
+ */
+static struct kobj_attribute *cache_index_opt_attrs[] = {
+	&cache_size_attr,
+	&cache_line_size_attr,
+	&cache_nr_sets_attr,
+	&cache_assoc_attr,
+};
+
+static const struct sysfs_ops cache_index_ops = {
+	.show = cache_index_show,
+};
+
+static struct kobj_type cache_index_type = {
+	.release = cache_index_release,
+	.sysfs_ops = &cache_index_ops,
+	.default_attrs = cache_index_default_attrs,
+};
+
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+{
+	const char *cache_type;
+	struct cache *cache;
+	char *buf;
+	int i;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	cache = dir->cache;
+	cache_type = cache_type_string(cache);
+
+	/* We don't want to create an attribute that can't provide a
+	 * meaningful value.  Check the return value of each optional
+	 * attribute's ->show method before registering the
+	 * attribute.
+	 */
+	for (i = 0; i < ARRAY_SIZE(cache_index_opt_attrs); i++) {
+		struct kobj_attribute *attr;
+		ssize_t rc;
+
+		attr = cache_index_opt_attrs[i];
+
+		rc = attr->show(&dir->kobj, attr, buf);
+		if (rc <= 0) {
+			pr_debug("not creating %s attribute for "
+				 "%pOF(%s) (rc = %zd)\n",
+				 attr->attr.name, cache->ofnode,
+				 cache_type, rc);
+			continue;
+		}
+		if (sysfs_create_file(&dir->kobj, &attr->attr))
+			pr_debug("could not create %s attribute for %pOF(%s)\n",
+				 attr->attr.name, cache->ofnode, cache_type);
+	}
+
+	kfree(buf);
+}
+
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+				       struct cache_dir *cache_dir)
+{
+	struct cache_index_dir *index_dir;
+	int rc;
+
+	index_dir = kzalloc(sizeof(*index_dir), GFP_KERNEL);
+	if (!index_dir)
+		goto err;
+
+	index_dir->cache = cache;
+
+	rc = kobject_init_and_add(&index_dir->kobj, &cache_index_type,
+				  cache_dir->kobj, "index%d", index);
+	if (rc)
+		goto err;
+
+	index_dir->next = cache_dir->index;
+	cache_dir->index = index_dir;
+
+	cacheinfo_create_index_opt_attrs(index_dir);
+
+	return;
+err:
+	kfree(index_dir);
+}
+
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+				     struct cache *cache_list)
+{
+	struct cache_dir *cache_dir;
+	struct cache *cache;
+	int index = 0;
+
+	cache_dir = cacheinfo_create_cache_dir(cpu_id);
+	if (!cache_dir)
+		return;
+
+	cache = cache_list;
+	while (cache) {
+		cacheinfo_create_index_dir(cache, index, cache_dir);
+		index++;
+		cache = cache->next_local;
+	}
+}
+
+void cacheinfo_cpu_online(unsigned int cpu_id)
+{
+	struct cache *cache;
+
+	cache = cache_chain_instantiate(cpu_id);
+	if (!cache)
+		return;
+
+	cacheinfo_sysfs_populate(cpu_id, cache);
+}
+
+/* functions needed to remove cache entry for cpu offline or suspend/resume */
+
+#if (defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SUSPEND)) || \
+    defined(CONFIG_HOTPLUG_CPU)
+
+static struct cache *cache_lookup_by_cpu(unsigned int cpu_id)
+{
+	struct device_node *cpu_node;
+	struct cache *cache;
+
+	cpu_node = of_get_cpu_node(cpu_id, NULL);
+	WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id);
+	if (!cpu_node)
+		return NULL;
+
+	cache = cache_lookup_by_node(cpu_node);
+	of_node_put(cpu_node);
+
+	return cache;
+}
+
+static void remove_index_dirs(struct cache_dir *cache_dir)
+{
+	struct cache_index_dir *index;
+
+	index = cache_dir->index;
+
+	while (index) {
+		struct cache_index_dir *next;
+
+		next = index->next;
+		kobject_put(&index->kobj);
+		index = next;
+	}
+}
+
+static void remove_cache_dir(struct cache_dir *cache_dir)
+{
+	remove_index_dirs(cache_dir);
+
+	/* Remove cache dir from sysfs */
+	kobject_del(cache_dir->kobj);
+
+	kobject_put(cache_dir->kobj);
+
+	kfree(cache_dir);
+}
+
+static void cache_cpu_clear(struct cache *cache, int cpu)
+{
+	while (cache) {
+		struct cache *next = cache->next_local;
+
+		WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map),
+			  "CPU %i not accounted in %pOF(%s)\n",
+			  cpu, cache->ofnode,
+			  cache_type_string(cache));
+
+		cpumask_clear_cpu(cpu, &cache->shared_cpu_map);
+
+		/* Release the cache object if all the cpus using it
+		 * are offline */
+		if (cpumask_empty(&cache->shared_cpu_map))
+			release_cache(cache);
+
+		cache = next;
+	}
+}
+
+void cacheinfo_cpu_offline(unsigned int cpu_id)
+{
+	struct cache_dir *cache_dir;
+	struct cache *cache;
+
+	/* Prevent userspace from seeing inconsistent state - remove
+	 * the sysfs hierarchy first */
+	cache_dir = per_cpu(cache_dir_pcpu, cpu_id);
+
+	/* careful, sysfs population may have failed */
+	if (cache_dir)
+		remove_cache_dir(cache_dir);
+
+	per_cpu(cache_dir_pcpu, cpu_id) = NULL;
+
+	/* clear the CPU's bit in its cache chain, possibly freeing
+	 * cache objects */
+	cache = cache_lookup_by_cpu(cpu_id);
+	if (cache)
+		cache_cpu_clear(cache, cpu_id);
+}
+
+void cacheinfo_teardown(void)
+{
+	unsigned int cpu;
+
+	lockdep_assert_cpus_held();
+
+	for_each_online_cpu(cpu)
+		cacheinfo_cpu_offline(cpu);
+}
+
+void cacheinfo_rebuild(void)
+{
+	unsigned int cpu;
+
+	lockdep_assert_cpus_held();
+
+	for_each_online_cpu(cpu)
+		cacheinfo_cpu_online(cpu);
+}
+
+#endif /* (CONFIG_PPC_PSERIES && CONFIG_SUSPEND) || CONFIG_HOTPLUG_CPU */
diff --git a/arch/powerpc/kernel/cacheinfo.h b/arch/powerpc/kernel/cacheinfo.h
new file mode 100644
index 000000000..52bd3fc66
--- /dev/null
+++ b/arch/powerpc/kernel/cacheinfo.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_CACHEINFO_H
+#define _PPC_CACHEINFO_H
+
+/* These are just hooks for sysfs.c to use. */
+extern void cacheinfo_cpu_online(unsigned int cpu_id);
+extern void cacheinfo_cpu_offline(unsigned int cpu_id);
+
+/* Allow migration/suspend to tear down and rebuild the hierarchy. */
+extern void cacheinfo_teardown(void);
+extern void cacheinfo_rebuild(void);
+
+#endif /* _PPC_CACHEINFO_H */
diff --git a/arch/powerpc/kernel/compat_audit.c b/arch/powerpc/kernel/compat_audit.c
new file mode 100644
index 000000000..55c6ccda0
--- /dev/null
+++ b/arch/powerpc/kernel/compat_audit.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef __powerpc64__
+#include <asm/unistd.h>
+
+unsigned ppc32_dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+unsigned ppc32_chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+unsigned ppc32_write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+unsigned ppc32_read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+unsigned ppc32_signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+int ppc32_classify_syscall(unsigned syscall)
+{
+	switch(syscall) {
+	case __NR_open:
+		return 2;
+	case __NR_openat:
+		return 3;
+	case __NR_socketcall:
+		return 4;
+	case __NR_execve:
+		return 5;
+	default:
+		return 1;
+	}
+}
diff --git a/arch/powerpc/kernel/cpu_setup_44x.S b/arch/powerpc/kernel/cpu_setup_44x.S
new file mode 100644
index 000000000..e32b4a9a2
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_44x.S
@@ -0,0 +1,74 @@
+/*
+ * This file contains low level CPU setup functions.
+ * Valentine Barshak <vbarshak@ru.mvista.com>
+ * MontaVista Software, Inc (c) 2007
+ *
+ * Based on cpu_setup_6xx code by
+ * Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(__setup_cpu_440ep)
+	b	__init_fpu_44x
+_GLOBAL(__setup_cpu_440epx)
+	mflr	r4
+	bl	__init_fpu_44x
+	bl	__plb_disable_wrp
+	bl	__fixup_440A_mcheck
+	mtlr	r4
+	blr
+_GLOBAL(__setup_cpu_440grx)
+	mflr	r4
+	bl	__plb_disable_wrp
+	bl	__fixup_440A_mcheck
+	mtlr	r4
+	blr
+_GLOBAL(__setup_cpu_460ex)
+_GLOBAL(__setup_cpu_460gt)
+_GLOBAL(__setup_cpu_460sx)
+_GLOBAL(__setup_cpu_apm821xx)
+	mflr	r4
+	bl	__init_fpu_44x
+	bl	__fixup_440A_mcheck
+	mtlr	r4
+	blr
+
+_GLOBAL(__setup_cpu_440x5)
+_GLOBAL(__setup_cpu_440gx)
+_GLOBAL(__setup_cpu_440spe)
+	b	__fixup_440A_mcheck
+
+/* enable APU between CPU and FPU */
+_GLOBAL(__init_fpu_44x)
+	mfspr	r3,SPRN_CCR0
+	/* Clear DAPUIB flag in CCR0 */
+	rlwinm	r3,r3,0,12,10
+	mtspr	SPRN_CCR0,r3
+	isync
+	blr
+
+/*
+ * Workaround for the incorrect write to DDR SDRAM errata.
+ * The write address can be corrupted during writes to
+ * DDR SDRAM when write pipelining is enabled on PLB0.
+ * Disable write pipelining here.
+ */
+#define DCRN_PLB4A0_ACR	0x81
+
+_GLOBAL(__plb_disable_wrp)
+	mfdcr	r3,DCRN_PLB4A0_ACR
+	/* clear WRP bit in PLB4A0_ACR */
+	rlwinm	r3,r3,0,8,6
+	mtdcr	DCRN_PLB4A0_ACR,r3
+	isync
+	blr
+
diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S
new file mode 100644
index 000000000..fa3c2c912
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_6xx.S
@@ -0,0 +1,490 @@
+/*
+ * This file contains low level CPU setup functions.
+ *    Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+#include <asm/mmu.h>
+#include <asm/feature-fixups.h>
+
+_GLOBAL(__setup_cpu_603)
+	mflr	r5
+BEGIN_MMU_FTR_SECTION
+	li	r10,0
+	mtspr	SPRN_SPRG_603_LRU,r10		/* init SW LRU tracking */
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
+BEGIN_FTR_SECTION
+	bl	__init_fpu_registers
+END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE)
+	bl	setup_common_caches
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_604)
+	mflr	r5
+	bl	setup_common_caches
+	bl	setup_604_hid0
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_750)
+	mflr	r5
+	bl	__init_fpu_registers
+	bl	setup_common_caches
+	bl	setup_750_7400_hid0
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_750cx)
+	mflr	r5
+	bl	__init_fpu_registers
+	bl	setup_common_caches
+	bl	setup_750_7400_hid0
+	bl	setup_750cx
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_750fx)
+	mflr	r5
+	bl	__init_fpu_registers
+	bl	setup_common_caches
+	bl	setup_750_7400_hid0
+	bl	setup_750fx
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_7400)
+	mflr	r5
+	bl	__init_fpu_registers
+	bl	setup_7400_workarounds
+	bl	setup_common_caches
+	bl	setup_750_7400_hid0
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_7410)
+	mflr	r5
+	bl	__init_fpu_registers
+	bl	setup_7410_workarounds
+	bl	setup_common_caches
+	bl	setup_750_7400_hid0
+	li	r3,0
+	mtspr	SPRN_L2CR2,r3
+	mtlr	r5
+	blr
+_GLOBAL(__setup_cpu_745x)
+	mflr	r5
+	bl	setup_common_caches
+	bl	setup_745x_specifics
+	mtlr	r5
+	blr
+
+/* Enable caches for 603's, 604, 750 & 7400 */
+setup_common_caches:
+	mfspr	r11,SPRN_HID0
+	andi.	r0,r11,HID0_DCE
+	ori	r11,r11,HID0_ICE|HID0_DCE
+	ori	r8,r11,HID0_ICFI
+	bne	1f			/* don't invalidate the D-cache */
+	ori	r8,r8,HID0_DCI		/* unless it wasn't enabled */
+1:	sync
+	mtspr	SPRN_HID0,r8		/* enable and invalidate caches */
+	sync
+	mtspr	SPRN_HID0,r11		/* enable caches */
+	sync
+	isync
+	blr
+
+/* 604, 604e, 604ev, ...
+ * Enable superscalar execution & branch history table
+ */
+setup_604_hid0:
+	mfspr	r11,SPRN_HID0
+	ori	r11,r11,HID0_SIED|HID0_BHTE
+	ori	r8,r11,HID0_BTCD
+	sync
+	mtspr	SPRN_HID0,r8	/* flush branch target address cache */
+	sync			/* on 604e/604r */
+	mtspr	SPRN_HID0,r11
+	sync
+	isync
+	blr
+
+/* 7400 <= rev 2.7 and 7410 rev = 1.0 suffer from some
+ * erratas we work around here.
+ * Moto MPC710CE.pdf describes them, those are errata
+ * #3, #4 and #5
+ * Note that we assume the firmware didn't choose to
+ * apply other workarounds (there are other ones documented
+ * in the .pdf). It appear that Apple firmware only works
+ * around #3 and with the same fix we use. We may want to
+ * check if the CPU is using 60x bus mode in which case
+ * the workaround for errata #4 is useless. Also, we may
+ * want to explicitly clear HID0_NOPDST as this is not
+ * needed once we have applied workaround #5 (though it's
+ * not set by Apple's firmware at least).
+ */
+setup_7400_workarounds:
+	mfpvr	r3
+	rlwinm	r3,r3,0,20,31
+	cmpwi	0,r3,0x0207
+	ble	1f
+	blr
+setup_7410_workarounds:
+	mfpvr	r3
+	rlwinm	r3,r3,0,20,31
+	cmpwi	0,r3,0x0100
+	bnelr
+1:
+	mfspr	r11,SPRN_MSSSR0
+	/* Errata #3: Set L1OPQ_SIZE to 0x10 */
+	rlwinm	r11,r11,0,9,6
+	oris	r11,r11,0x0100
+	/* Errata #4: Set L2MQ_SIZE to 1 (check for MPX mode first ?) */
+	oris	r11,r11,0x0002
+	/* Errata #5: Set DRLT_SIZE to 0x01 */
+	rlwinm	r11,r11,0,5,2
+	oris	r11,r11,0x0800
+	sync
+	mtspr	SPRN_MSSSR0,r11
+	sync
+	isync
+	blr
+
+/* 740/750/7400/7410
+ * Enable Store Gathering (SGE), Address Broadcast (ABE),
+ * Branch History Table (BHTE), Branch Target ICache (BTIC)
+ * Dynamic Power Management (DPM), Speculative (SPD)
+ * Clear Instruction cache throttling (ICTC)
+ */
+setup_750_7400_hid0:
+	mfspr	r11,SPRN_HID0
+	ori	r11,r11,HID0_SGE | HID0_ABE | HID0_BHTE | HID0_BTIC
+	oris	r11,r11,HID0_DPM@h
+BEGIN_FTR_SECTION
+	xori	r11,r11,HID0_BTIC
+END_FTR_SECTION_IFSET(CPU_FTR_NO_BTIC)
+BEGIN_FTR_SECTION
+	xoris	r11,r11,HID0_DPM@h	/* disable dynamic power mgmt */
+END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM)
+	li	r3,HID0_SPD
+	andc	r11,r11,r3		/* clear SPD: enable speculative */
+ 	li	r3,0
+ 	mtspr	SPRN_ICTC,r3		/* Instruction Cache Throttling off */
+	isync
+	mtspr	SPRN_HID0,r11
+	sync
+	isync
+	blr
+
+/* 750cx specific
+ * Looks like we have to disable NAP feature for some PLL settings...
+ * (waiting for confirmation)
+ */
+setup_750cx:
+	mfspr	r10, SPRN_HID1
+	rlwinm	r10,r10,4,28,31
+	cmpwi	cr0,r10,7
+	cmpwi	cr1,r10,9
+	cmpwi	cr2,r10,11
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr2+eq
+	bnelr
+	lwz	r6,CPU_SPEC_FEATURES(r4)
+	li	r7,CPU_FTR_CAN_NAP
+	andc	r6,r6,r7
+	stw	r6,CPU_SPEC_FEATURES(r4)
+	blr
+
+/* 750fx specific
+ */
+setup_750fx:
+	blr
+
+/* MPC 745x
+ * Enable Store Gathering (SGE), Branch Folding (FOLD)
+ * Branch History Table (BHTE), Branch Target ICache (BTIC)
+ * Dynamic Power Management (DPM), Speculative (SPD)
+ * Ensure our data cache instructions really operate.
+ * Timebase has to be running or we wouldn't have made it here,
+ * just ensure we don't disable it.
+ * Clear Instruction cache throttling (ICTC)
+ * Enable L2 HW prefetch
+ */
+setup_745x_specifics:
+	/* We check for the presence of an L3 cache setup by
+	 * the firmware. If any, we disable NAP capability as
+	 * it's known to be bogus on rev 2.1 and earlier
+	 */
+BEGIN_FTR_SECTION
+	mfspr	r11,SPRN_L3CR
+	andis.	r11,r11,L3CR_L3E@h
+	beq	1f
+END_FTR_SECTION_IFSET(CPU_FTR_L3CR)
+	lwz	r6,CPU_SPEC_FEATURES(r4)
+	andis.	r0,r6,CPU_FTR_L3_DISABLE_NAP@h
+	beq	1f
+	li	r7,CPU_FTR_CAN_NAP
+	andc	r6,r6,r7
+	stw	r6,CPU_SPEC_FEATURES(r4)
+1:
+	mfspr	r11,SPRN_HID0
+
+	/* All of the bits we have to set.....
+	 */
+	ori	r11,r11,HID0_SGE | HID0_FOLD | HID0_BHTE
+	ori	r11,r11,HID0_LRSTK | HID0_BTIC
+	oris	r11,r11,HID0_DPM@h
+BEGIN_MMU_FTR_SECTION
+	oris	r11,r11,HID0_HIGH_BAT@h
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+BEGIN_FTR_SECTION
+	xori	r11,r11,HID0_BTIC
+END_FTR_SECTION_IFSET(CPU_FTR_NO_BTIC)
+BEGIN_FTR_SECTION
+	xoris	r11,r11,HID0_DPM@h	/* disable dynamic power mgmt */
+END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM)
+
+	/* All of the bits we have to clear....
+	 */
+	li	r3,HID0_SPD | HID0_NOPDST | HID0_NOPTI
+	andc	r11,r11,r3		/* clear SPD: enable speculative */
+ 	li	r3,0
+
+ 	mtspr	SPRN_ICTC,r3		/* Instruction Cache Throttling off */
+	isync
+	mtspr	SPRN_HID0,r11
+	sync
+	isync
+
+	/* Enable L2 HW prefetch, if L2 is enabled
+	 */
+	mfspr	r3,SPRN_L2CR
+	andis.	r3,r3,L2CR_L2E@h
+	beqlr
+	mfspr	r3,SPRN_MSSCR0
+	ori	r3,r3,3
+	sync
+	mtspr	SPRN_MSSCR0,r3
+	sync
+	isync
+	blr
+
+/*
+ * Initialize the FPU registers. This is needed to work around an errata
+ * in some 750 cpus where using a not yet initialized FPU register after
+ * power on reset may hang the CPU
+ */
+_GLOBAL(__init_fpu_registers)
+	mfmsr	r10
+	ori	r11,r10,MSR_FP
+	mtmsr	r11
+	isync
+	addis	r9,r3,empty_zero_page@ha
+	addi	r9,r9,empty_zero_page@l
+	REST_32FPRS(0,r9)
+	sync
+	mtmsr	r10
+	isync
+	blr
+
+
+/* Definitions for the table use to save CPU states */
+#define CS_HID0		0
+#define CS_HID1		4
+#define CS_HID2		8
+#define	CS_MSSCR0	12
+#define CS_MSSSR0	16
+#define CS_ICTRL	20
+#define CS_LDSTCR	24
+#define CS_LDSTDB	28
+#define CS_SIZE		32
+
+	.data
+	.balign	L1_CACHE_BYTES
+cpu_state_storage:
+	.space	CS_SIZE
+	.balign	L1_CACHE_BYTES,0
+	.text
+
+/* Called in normal context to backup CPU 0 state. This
+ * does not include cache settings. This function is also
+ * called for machine sleep. This does not include the MMU
+ * setup, BATs, etc... but rather the "special" registers
+ * like HID0, HID1, MSSCR0, etc...
+ */
+_GLOBAL(__save_cpu_setup)
+	/* Some CR fields are volatile, we back it up all */
+	mfcr	r7
+
+	/* Get storage ptr */
+	lis	r5,cpu_state_storage@h
+	ori	r5,r5,cpu_state_storage@l
+
+	/* Save HID0 (common to all CONFIG_6xx cpus) */
+	mfspr	r3,SPRN_HID0
+	stw	r3,CS_HID0(r5)
+
+	/* Now deal with CPU type dependent registers */
+	mfspr	r3,SPRN_PVR
+	srwi	r3,r3,16
+	cmplwi	cr0,r3,0x8000	/* 7450 */
+	cmplwi	cr1,r3,0x000c	/* 7400 */
+	cmplwi	cr2,r3,0x800c	/* 7410 */
+	cmplwi	cr3,r3,0x8001	/* 7455 */
+	cmplwi	cr4,r3,0x8002	/* 7457 */
+	cmplwi	cr5,r3,0x8003	/* 7447A */
+	cmplwi	cr6,r3,0x7000	/* 750FX */
+	cmplwi	cr7,r3,0x8004	/* 7448 */
+	/* cr1 is 7400 || 7410 */
+	cror	4*cr1+eq,4*cr1+eq,4*cr2+eq
+	/* cr0 is 74xx */
+	cror	4*cr0+eq,4*cr0+eq,4*cr3+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr4+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr5+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr7+eq
+	bne	1f
+	/* Backup 74xx specific regs */
+	mfspr	r4,SPRN_MSSCR0
+	stw	r4,CS_MSSCR0(r5)
+	mfspr	r4,SPRN_MSSSR0
+	stw	r4,CS_MSSSR0(r5)
+	beq	cr1,1f
+	/* Backup 745x specific registers */
+	mfspr	r4,SPRN_HID1
+	stw	r4,CS_HID1(r5)
+	mfspr	r4,SPRN_ICTRL
+	stw	r4,CS_ICTRL(r5)
+	mfspr	r4,SPRN_LDSTCR
+	stw	r4,CS_LDSTCR(r5)
+	mfspr	r4,SPRN_LDSTDB
+	stw	r4,CS_LDSTDB(r5)
+1:
+	bne	cr6,1f
+	/* Backup 750FX specific registers */
+	mfspr	r4,SPRN_HID1
+	stw	r4,CS_HID1(r5)
+	/* If rev 2.x, backup HID2 */
+	mfspr	r3,SPRN_PVR
+	andi.	r3,r3,0xff00
+	cmpwi	cr0,r3,0x0200
+	bne	1f
+	mfspr	r4,SPRN_HID2
+	stw	r4,CS_HID2(r5)
+1:
+	mtcr	r7
+	blr
+
+/* Called with no MMU context (typically MSR:IR/DR off) to
+ * restore CPU state as backed up by the previous
+ * function. This does not include cache setting
+ */
+_GLOBAL(__restore_cpu_setup)
+	/* Some CR fields are volatile, we back it up all */
+	mfcr	r7
+
+	/* Get storage ptr */
+	lis	r5,(cpu_state_storage-KERNELBASE)@h
+	ori	r5,r5,cpu_state_storage@l
+
+	/* Restore HID0 */
+	lwz	r3,CS_HID0(r5)
+	sync
+	isync
+	mtspr	SPRN_HID0,r3
+	sync
+	isync
+
+	/* Now deal with CPU type dependent registers */
+	mfspr	r3,SPRN_PVR
+	srwi	r3,r3,16
+	cmplwi	cr0,r3,0x8000	/* 7450 */
+	cmplwi	cr1,r3,0x000c	/* 7400 */
+	cmplwi	cr2,r3,0x800c	/* 7410 */
+	cmplwi	cr3,r3,0x8001	/* 7455 */
+	cmplwi	cr4,r3,0x8002	/* 7457 */
+	cmplwi	cr5,r3,0x8003	/* 7447A */
+	cmplwi	cr6,r3,0x7000	/* 750FX */
+	cmplwi	cr7,r3,0x8004	/* 7448 */
+	/* cr1 is 7400 || 7410 */
+	cror	4*cr1+eq,4*cr1+eq,4*cr2+eq
+	/* cr0 is 74xx */
+	cror	4*cr0+eq,4*cr0+eq,4*cr3+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr4+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr1+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr5+eq
+	cror	4*cr0+eq,4*cr0+eq,4*cr7+eq
+	bne	2f
+	/* Restore 74xx specific regs */
+	lwz	r4,CS_MSSCR0(r5)
+	sync
+	mtspr	SPRN_MSSCR0,r4
+	sync
+	isync
+	lwz	r4,CS_MSSSR0(r5)
+	sync
+	mtspr	SPRN_MSSSR0,r4
+	sync
+	isync
+	bne	cr2,1f
+	/* Clear 7410 L2CR2 */
+	li	r4,0
+	mtspr	SPRN_L2CR2,r4
+1:	beq	cr1,2f
+	/* Restore 745x specific registers */
+	lwz	r4,CS_HID1(r5)
+	sync
+	mtspr	SPRN_HID1,r4
+	isync
+	sync
+	lwz	r4,CS_ICTRL(r5)
+	sync
+	mtspr	SPRN_ICTRL,r4
+	isync
+	sync
+	lwz	r4,CS_LDSTCR(r5)
+	sync
+	mtspr	SPRN_LDSTCR,r4
+	isync
+	sync
+	lwz	r4,CS_LDSTDB(r5)
+	sync
+	mtspr	SPRN_LDSTDB,r4
+	isync
+	sync
+2:	bne	cr6,1f
+	/* Restore 750FX specific registers
+	 * that is restore HID2 on rev 2.x and PLL config & switch
+	 * to PLL 0 on all
+	 */
+	/* If rev 2.x, restore HID2 with low voltage bit cleared */
+	mfspr	r3,SPRN_PVR
+	andi.	r3,r3,0xff00
+	cmpwi	cr0,r3,0x0200
+	bne	4f
+	lwz	r4,CS_HID2(r5)
+	rlwinm	r4,r4,0,19,17
+	mtspr	SPRN_HID2,r4
+	sync
+4:
+	lwz	r4,CS_HID1(r5)
+	rlwinm  r5,r4,0,16,14
+	mtspr	SPRN_HID1,r5
+		/* Wait for PLL to stabilize */
+	mftbl	r5
+3:	mftbl	r6
+	sub	r6,r6,r5
+	cmplwi	cr0,r6,10000
+	ble	3b
+	/* Setup final PLL */
+	mtspr	SPRN_HID1,r4
+1:
+	mtcr	r7
+	blr
+
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
new file mode 100644
index 000000000..8d142e5d8
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -0,0 +1,347 @@
+/*
+ * This file contains low level CPU setup functions.
+ * Kumar Gala <galak@kernel.crashing.org>
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * Based on cpu_setup_6xx code by
+ * Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/mmu-book3e.h>
+#include <asm/asm-offsets.h>
+#include <asm/mpc85xx.h>
+
+_GLOBAL(__e500_icache_setup)
+	mfspr	r0, SPRN_L1CSR1
+	andi.	r3, r0, L1CSR1_ICE
+	bnelr				/* Already enabled */
+	oris	r0, r0, L1CSR1_CPE@h
+	ori	r0, r0, (L1CSR1_ICFI | L1CSR1_ICLFR |  L1CSR1_ICE)
+	mtspr	SPRN_L1CSR1, r0		/* Enable I-Cache */
+	isync
+	blr
+
+_GLOBAL(__e500_dcache_setup)
+	mfspr	r0, SPRN_L1CSR0
+	andi.	r3, r0, L1CSR0_DCE
+	bnelr				/* Already enabled */
+	msync
+	isync
+	li	r0, 0
+	mtspr	SPRN_L1CSR0, r0		/* Disable */
+	msync
+	isync
+	li	r0, (L1CSR0_DCFI | L1CSR0_CLFC)
+	mtspr	SPRN_L1CSR0, r0		/* Invalidate */
+	isync
+1:	mfspr	r0, SPRN_L1CSR0
+	andi.	r3, r0, L1CSR0_CLFC
+	bne+	1b			/* Wait for lock bits reset */
+	oris	r0, r0, L1CSR0_CPE@h
+	ori	r0, r0, L1CSR0_DCE
+	msync
+	isync
+	mtspr	SPRN_L1CSR0, r0		/* Enable */
+	isync
+	blr
+
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for PW20_WAIT_IDLE_BIT.
+ */
+#define PW20_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_pw20_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Set PW20_WAIT bit, enable pw20 state*/
+	ori	r3, r3, PWRMGTCR0_PW20_WAIT
+	li	r11, PW20_WAIT_IDLE_BIT
+
+	/* Set Automatic PW20 Core Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for AV_WAIT_IDLE_BIT.
+ */
+#define AV_WAIT_IDLE_BIT		50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_altivec_idle)
+	mfspr	r3, SPRN_PWRMGTCR0
+
+	/* Enable Altivec Idle */
+	oris	r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h
+	li	r11, AV_WAIT_IDLE_BIT
+
+	/* Set Automatic AltiVec Idle Count */
+	rlwimi	r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT
+
+	mtspr	SPRN_PWRMGTCR0, r3
+
+	blr
+
+#ifdef CONFIG_PPC_E500MC
+_GLOBAL(__setup_cpu_e6500)
+	mflr	r6
+#ifdef CONFIG_PPC64
+	bl	setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_lrat_ivor
+1:
+#endif
+	bl	setup_pw20_idle
+	bl	setup_altivec_idle
+	bl	__setup_cpu_e5500
+	mtlr	r6
+	blr
+#endif /* CONFIG_PPC_E500MC */
+
+#ifdef CONFIG_PPC32
+#ifdef CONFIG_E200
+_GLOBAL(__setup_cpu_e200)
+	/* enable dedicated debug exception handling resources (Debug APU) */
+	mfspr	r3,SPRN_HID0
+	ori	r3,r3,HID0_DAPUEN@l
+	mtspr	SPRN_HID0,r3
+	b	__setup_e200_ivors
+#endif /* CONFIG_E200 */
+
+#ifdef CONFIG_E500
+#ifndef CONFIG_PPC_E500MC
+_GLOBAL(__setup_cpu_e500v1)
+_GLOBAL(__setup_cpu_e500v2)
+	mflr	r4
+	bl	__e500_icache_setup
+	bl	__e500_dcache_setup
+	bl	__setup_e500_ivors
+#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI)
+	/* Ensure that RFXE is set */
+	mfspr	r3,SPRN_HID1
+	oris	r3,r3,HID1_RFXE@h
+	mtspr	SPRN_HID1,r3
+#endif
+	mtlr	r4
+	blr
+#else /* CONFIG_PPC_E500MC */
+_GLOBAL(__setup_cpu_e500mc)
+_GLOBAL(__setup_cpu_e5500)
+	mflr	r5
+	bl	__e500_icache_setup
+	bl	__e500_dcache_setup
+	bl	__setup_e500mc_ivors
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r3, SPRN_MMUCFG
+	rlwinm.	r3, r3, 0, MMUCFG_LPIDSIZE
+	beq	1f
+	bl	__setup_ehv_ivors
+	b	2f
+1:
+	lwz	r3, CPU_SPEC_FEATURES(r4)
+	/* We need this check as cpu_setup is also called for
+	 * the secondary cores. So, if we have already cleared
+	 * the feature on the primary core, avoid doing it on the
+	 * secondary core.
+	 */
+	andi.	r6, r3, CPU_FTR_EMB_HV
+	beq	2f
+	rlwinm	r3, r3, 0, ~CPU_FTR_EMB_HV
+	stw	r3, CPU_SPEC_FEATURES(r4)
+2:
+	mtlr	r5
+	blr
+#endif /* CONFIG_PPC_E500MC */
+#endif /* CONFIG_E500 */
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC_BOOK3E_64
+_GLOBAL(__restore_cpu_e6500)
+	mflr	r5
+	bl	setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_lrat_ivor
+1:
+	bl	setup_pw20_idle
+	bl	setup_altivec_idle
+	bl	__restore_cpu_e5500
+	mtlr	r5
+	blr
+
+_GLOBAL(__restore_cpu_e5500)
+	mflr	r4
+	bl	__e500_icache_setup
+	bl	__e500_dcache_setup
+	bl	__setup_base_ivors
+	bl	setup_perfmon_ivor
+	bl	setup_doorbell_ivors
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_ehv_ivors
+1:
+	mtlr	r4
+	blr
+
+_GLOBAL(__setup_cpu_e5500)
+	mflr	r5
+	bl	__e500_icache_setup
+	bl	__e500_dcache_setup
+	bl	__setup_base_ivors
+	bl	setup_perfmon_ivor
+	bl	setup_doorbell_ivors
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	setup_ehv_ivors
+	b	2f
+1:
+	ld	r10,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV)
+	andc	r10,r10,r9
+	std	r10,CPU_SPEC_FEATURES(r4)
+2:
+	mtlr	r5
+	blr
+#endif
+
+/* flush L1 date cache, it can apply to e500v2, e500mc and e5500 */
+_GLOBAL(flush_dcache_L1)
+	mfmsr	r10
+	wrteei	0
+
+	mfspr	r3,SPRN_L1CFG0
+	rlwinm	r5,r3,9,3	/* Extract cache block size */
+	twlgti	r5,1		/* Only 32 and 64 byte cache blocks
+				 * are currently defined.
+				 */
+	li	r4,32
+	subfic	r6,r5,2		/* r6 = log2(1KiB / cache block size) -
+				 *      log2(number of ways)
+				 */
+	slw	r5,r4,r5	/* r5 = cache block size */
+
+	rlwinm	r7,r3,0,0xff	/* Extract number of KiB in the cache */
+	mulli	r7,r7,13	/* An 8-way cache will require 13
+				 * loads per set.
+				 */
+	slw	r7,r7,r6
+
+	/* save off HID0 and set DCFA */
+	mfspr	r8,SPRN_HID0
+	ori	r9,r8,HID0_DCFA@l
+	mtspr	SPRN_HID0,r9
+	isync
+
+	LOAD_REG_IMMEDIATE(r6, KERNELBASE)
+	mr	r4, r6
+	mtctr	r7
+
+1:	lwz	r3,0(r4)	/* Load... */
+	add	r4,r4,r5
+	bdnz	1b
+
+	msync
+	mr	r4, r6
+	mtctr	r7
+
+1:	dcbf	0,r4		/* ...and flush. */
+	add	r4,r4,r5
+	bdnz	1b
+
+	/* restore HID0 */
+	mtspr	SPRN_HID0,r8
+	isync
+
+	wrtee r10
+
+	blr
+
+has_L2_cache:
+	/* skip L2 cache on P2040/P2040E as they have no L2 cache */
+	mfspr	r3, SPRN_SVR
+	/* shift right by 8 bits and clear E bit of SVR */
+	rlwinm	r4, r3, 24, ~0x800
+
+	lis	r3, SVR_P2040@h
+	ori	r3, r3, SVR_P2040@l
+	cmpw	r4, r3
+	beq	1f
+
+	li	r3, 1
+	blr
+1:
+	li	r3, 0
+	blr
+
+/* flush backside L2 cache */
+flush_backside_L2_cache:
+	mflr	r10
+	bl	has_L2_cache
+	mtlr	r10
+	cmpwi	r3, 0
+	beq	2f
+
+	/* Flush the L2 cache */
+	mfspr	r3, SPRN_L2CSR0
+	ori	r3, r3, L2CSR0_L2FL@l
+	msync
+	isync
+	mtspr	SPRN_L2CSR0,r3
+	isync
+
+	/* check if it is complete */
+1:	mfspr	r3,SPRN_L2CSR0
+	andi.	r3, r3, L2CSR0_L2FL@l
+	bne	1b
+2:
+	blr
+
+_GLOBAL(cpu_down_flush_e500v2)
+	mflr r0
+	bl	flush_dcache_L1
+	mtlr r0
+	blr
+
+_GLOBAL(cpu_down_flush_e500mc)
+_GLOBAL(cpu_down_flush_e5500)
+	mflr r0
+	bl	flush_dcache_L1
+	bl	flush_backside_L2_cache
+	mtlr r0
+	blr
+
+/* L1 Data Cache of e6500 contains no modified data, no flush is required */
+_GLOBAL(cpu_down_flush_e6500)
+	blr
diff --git a/arch/powerpc/kernel/cpu_setup_pa6t.S b/arch/powerpc/kernel/cpu_setup_pa6t.S
new file mode 100644
index 000000000..d62cb9cae
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_pa6t.S
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2006-2007 PA Semi, Inc
+ *
+ * Maintained by: Olof Johansson <olof@lixom.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+
+/* Right now, restore and setup are the same thing */
+_GLOBAL(__restore_cpu_pa6t)
+_GLOBAL(__setup_cpu_pa6t)
+	/* Do nothing if not running in HV mode */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beqlr
+
+	mfspr	r0,SPRN_HID5
+	ori	r0,r0,0x38
+	mtspr	SPRN_HID5,r0
+
+	mfspr	r0,SPRN_LPCR
+	ori	r0,r0,0x7000
+	mtspr	SPRN_LPCR,r0
+
+	blr
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
new file mode 100644
index 000000000..5bef78e2b
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -0,0 +1,218 @@
+/*
+ * This file contains low level CPU setup functions.
+ *    Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+#include <asm/book3s/64/mmu-hash.h>
+
+/* Entry: r3 = crap, r4 = ptr to cputable entry
+ *
+ * Note that we can be called twice for pseudo-PVRs
+ */
+_GLOBAL(__setup_cpu_power7)
+	mflr	r11
+	bl	__init_hvmode_206
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr	r3,SPRN_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+	bl	__init_LPCR_ISA206
+	mtlr	r11
+	blr
+
+_GLOBAL(__restore_cpu_power7)
+	mflr	r11
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr	r3,SPRN_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+	bl	__init_LPCR_ISA206
+	mtlr	r11
+	blr
+
+_GLOBAL(__setup_cpu_power8)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	bl	__init_PMU_ISA207
+	bl	__init_hvmode_206
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr	r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA206
+	bl	__init_HFSCR
+	bl	__init_PMU_HV
+	bl	__init_PMU_HV_ISA207
+	mtlr	r11
+	blr
+
+_GLOBAL(__restore_cpu_power8)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	bl	__init_PMU_ISA207
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr   r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA206
+	bl	__init_HFSCR
+	bl	__init_PMU_HV
+	bl	__init_PMU_HV_ISA207
+	mtlr	r11
+	blr
+
+_GLOBAL(__setup_cpu_power9)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	bl	__init_hvmode_206
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_PSSCR,r0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr	r3,SPRN_LPCR
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
+	or	r3, r3, r4
+	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
+	andc	r3, r3, r4
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA300
+	bl	__init_HFSCR
+	bl	__init_PMU_HV
+	mtlr	r11
+	blr
+
+_GLOBAL(__restore_cpu_power9)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_PMU
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_PSSCR,r0
+	mtspr	SPRN_LPID,r0
+	mtspr	SPRN_PID,r0
+	mtspr	SPRN_PCR,r0
+	mfspr   r3,SPRN_LPCR
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
+	or	r3, r3, r4
+	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
+	andc	r3, r3, r4
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA300
+	bl	__init_HFSCR
+	bl	__init_PMU_HV
+	mtlr	r11
+	blr
+
+__init_hvmode_206:
+	/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	bnelr
+	ld	r5,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+	xor	r5,r5,r6
+	std	r5,CPU_SPEC_FEATURES(r4)
+	blr
+
+__init_LPCR_ISA206:
+	/* Setup a sane LPCR:
+	 *   Called with initial LPCR in R3 and desired LPES 2-bit value in R4
+	 *
+	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
+	 *   PECE = 0b111
+	 *   DPFD = 4
+	 *   HDICE = 0
+	 *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+	 *   VRMASD = 0b10000 (L=1, LP=00)
+	 *
+	 * Other bits untouched for now
+	 */
+	li	r5,0x10
+	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
+
+	/* POWER9 has no VRMASD */
+__init_LPCR_ISA300:
+	rldimi	r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
+	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
+	li	r5,4
+	rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+	clrrdi	r3,r3,1		/* clear HDICE */
+	li	r5,4
+	rldimi	r3,r5, LPCR_VC_SH, 0
+	mtspr	SPRN_LPCR,r3
+	isync
+	blr
+
+__init_FSCR:
+	mfspr	r3,SPRN_FSCR
+	ori	r3,r3,FSCR_TAR|FSCR_EBB
+	mtspr	SPRN_FSCR,r3
+	blr
+
+__init_HFSCR:
+	mfspr	r3,SPRN_HFSCR
+	ori	r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
+		      HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP
+	mtspr	SPRN_HFSCR,r3
+	blr
+
+__init_PMU_HV:
+	li	r5,0
+	mtspr	SPRN_MMCRC,r5
+	blr
+
+__init_PMU_HV_ISA207:
+	li	r5,0
+	mtspr	SPRN_MMCRH,r5
+	blr
+
+__init_PMU:
+	li	r5,0
+	mtspr	SPRN_MMCRA,r5
+	mtspr	SPRN_MMCR0,r5
+	mtspr	SPRN_MMCR1,r5
+	mtspr	SPRN_MMCR2,r5
+	blr
+
+__init_PMU_ISA207:
+	li	r5,0
+	mtspr	SPRN_MMCRS,r5
+	blr
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S
new file mode 100644
index 000000000..12fac8df0
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_ppc970.S
@@ -0,0 +1,210 @@
+/*
+ * This file contains low level CPU setup functions.
+ *    Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+
+_GLOBAL(__cpu_preinit_ppc970)
+	/* Do nothing if not running in HV mode */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beqlr
+
+	/* Make sure HID4:rm_ci is off before MMU is turned off, that large
+	 * pages are enabled with HID4:61 and clear HID5:DCBZ_size and
+	 * HID5:DCBZ32_ill
+	 */
+	li	r0,0
+	mfspr	r3,SPRN_HID4
+	rldimi	r3,r0,40,23	/* clear bit 23 (rm_ci) */
+	rldimi	r3,r0,2,61	/* clear bit 61 (lg_pg_en) */
+	sync
+	mtspr	SPRN_HID4,r3
+	isync
+	sync
+	mfspr	r3,SPRN_HID5
+	rldimi	r3,r0,6,56	/* clear bits 56 & 57 (DCBZ*) */
+	sync
+	mtspr	SPRN_HID5,r3
+	isync
+	sync
+
+	/* Setup some basic HID1 features */
+	mfspr	r0,SPRN_HID1
+	li	r3,0x1200		/* enable i-fetch cacheability */
+	sldi	r3,r3,44		/* and prefetch */
+	or	r0,r0,r3
+	mtspr	SPRN_HID1,r0
+	mtspr	SPRN_HID1,r0
+	isync
+
+	/* Clear HIOR */
+	li	r0,0
+	sync
+	mtspr	SPRN_HIOR,0		/* Clear interrupt prefix */
+	isync
+	blr
+
+/* Definitions for the table use to save CPU states */
+#define CS_HID0		0
+#define CS_HID1		8
+#define	CS_HID4		16
+#define CS_HID5		24
+#define CS_SIZE		32
+
+	.data
+	.balign	L1_CACHE_BYTES,0
+cpu_state_storage:
+	.space	CS_SIZE
+	.balign	L1_CACHE_BYTES,0
+	.text
+
+
+_GLOBAL(__setup_cpu_ppc970)
+	/* Do nothing if not running in HV mode */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beq	no_hv_mode
+
+	mfspr	r0,SPRN_HID0
+	li	r11,5			/* clear DOZE and SLEEP */
+	rldimi	r0,r11,52,8		/* set NAP and DPM */
+	li	r11,0
+	rldimi	r0,r11,32,31		/* clear EN_ATTN */
+	b	load_hids		/* Jump to shared code */
+
+
+_GLOBAL(__setup_cpu_ppc970MP)
+	/* Do nothing if not running in HV mode */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beq	no_hv_mode
+
+	mfspr	r0,SPRN_HID0
+	li	r11,0x15		/* clear DOZE and SLEEP */
+	rldimi	r0,r11,52,6		/* set DEEPNAP, NAP and DPM */
+	li	r11,0
+	rldimi	r0,r11,32,31		/* clear EN_ATTN */
+
+load_hids:
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	sync
+	isync
+
+	/* Try to set LPES = 01 in HID4 */
+	mfspr	r0,SPRN_HID4
+	clrldi	r0,r0,1			/* clear LPES0 */
+	ori	r0,r0,HID4_LPES1	/* set LPES1 */
+	sync
+	mtspr	SPRN_HID4,r0
+	isync
+
+	/* Save away cpu state */
+	LOAD_REG_ADDR(r5,cpu_state_storage)
+
+	/* Save HID0,1,4 and 5 */
+	mfspr	r3,SPRN_HID0
+	std	r3,CS_HID0(r5)
+	mfspr	r3,SPRN_HID1
+	std	r3,CS_HID1(r5)
+	mfspr	r4,SPRN_HID4
+	std	r4,CS_HID4(r5)
+	mfspr	r3,SPRN_HID5
+	std	r3,CS_HID5(r5)
+
+	/* See if we successfully set LPES1 to 1; if not we are in Apple mode */
+	andi.	r4,r4,HID4_LPES1
+	bnelr
+
+no_hv_mode:
+	/* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+	ld	r5,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+	andc	r5,r5,r6
+	std	r5,CPU_SPEC_FEATURES(r4)
+	blr
+
+/* Called with no MMU context (typically MSR:IR/DR off) to
+ * restore CPU state as backed up by the previous
+ * function. This does not include cache setting
+ */
+_GLOBAL(__restore_cpu_ppc970)
+	/* Do nothing if not running in HV mode */
+	mfmsr	r0
+	rldicl.	r0,r0,4,63
+	beqlr
+
+	LOAD_REG_ADDR(r5,cpu_state_storage)
+	/* Before accessing memory, we make sure rm_ci is clear */
+	li	r0,0
+	mfspr	r3,SPRN_HID4
+	rldimi	r3,r0,40,23	/* clear bit 23 (rm_ci) */
+	sync
+	mtspr	SPRN_HID4,r3
+	isync
+	sync
+
+	/* Clear interrupt prefix */
+	li	r0,0
+	sync
+	mtspr	SPRN_HIOR,0
+	isync
+
+	/* Restore HID0 */
+	ld	r3,CS_HID0(r5)
+	sync
+	isync
+	mtspr	SPRN_HID0,r3
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	mfspr	r3,SPRN_HID0
+	sync
+	isync
+
+	/* Restore HID1 */
+	ld	r3,CS_HID1(r5)
+	sync
+	isync
+	mtspr	SPRN_HID1,r3
+	mtspr	SPRN_HID1,r3
+	sync
+	isync
+
+	/* Restore HID4 */
+	ld	r3,CS_HID4(r5)
+	sync
+	isync
+	mtspr	SPRN_HID4,r3
+	sync
+	isync
+
+	/* Restore HID5 */
+	ld	r3,CS_HID5(r5)
+	sync
+	isync
+	mtspr	SPRN_HID5,r3
+	sync
+	isync
+	blr
+
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
new file mode 100644
index 000000000..a2183bb54
--- /dev/null
+++ b/arch/powerpc/kernel/cputable.c
@@ -0,0 +1,2299 @@
+/*
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+
+#include <asm/oprofile_impl.h>
+#include <asm/cputable.h>
+#include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
+#include <asm/mmu.h>
+#include <asm/setup.h>
+
+static struct cpu_spec the_cpu_spec __read_mostly;
+
+struct cpu_spec* cur_cpu_spec __read_mostly = NULL;
+EXPORT_SYMBOL(cur_cpu_spec);
+
+/* The platform string corresponding to the real PVR */
+const char *powerpc_base_platform;
+
+/* NOTE:
+ * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
+ * the responsibility of the appropriate CPU save/restore functions to
+ * eventually copy these settings over. Those save/restore aren't yet
+ * part of the cputable though. That has to be fixed for both ppc32
+ * and ppc64
+ */
+#ifdef CONFIG_PPC32
+extern void __setup_cpu_e200(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e500v1(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e500v2(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e500mc(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440ep(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440epx(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440gx(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440grx(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440spe(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_440x5(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_460ex(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_460gt(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_460sx(unsigned long offset, struct cpu_spec *spec);
+extern void __setup_cpu_apm821xx(unsigned long offset, struct cpu_spec *spec);
+extern void __setup_cpu_603(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_604(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_750(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_750cx(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_750fx(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_7400(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_7410(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec);
+#endif /* CONFIG_PPC32 */
+#ifdef CONFIG_PPC64
+extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_pa6t(void);
+extern void __restore_cpu_ppc970(void);
+extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power7(void);
+extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power8(void);
+extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power9(void);
+extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p9(struct pt_regs *regs);
+#endif /* CONFIG_PPC64 */
+#if defined(CONFIG_E500)
+extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_e5500(void);
+extern void __restore_cpu_e6500(void);
+#endif /* CONFIG_E500 */
+
+/* This table only contains "desktop" CPUs, it need to be filled with embedded
+ * ones as well...
+ */
+#define COMMON_USER		(PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \
+				 PPC_FEATURE_HAS_MMU)
+#define COMMON_USER_PPC64	(COMMON_USER | PPC_FEATURE_64)
+#define COMMON_USER_POWER4	(COMMON_USER_PPC64 | PPC_FEATURE_POWER4)
+#define COMMON_USER_POWER5	(COMMON_USER_PPC64 | PPC_FEATURE_POWER5 |\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP)
+#define COMMON_USER_POWER5_PLUS	(COMMON_USER_PPC64 | PPC_FEATURE_POWER5_PLUS|\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP)
+#define COMMON_USER_POWER6	(COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_05 |\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
+				 PPC_FEATURE_TRUE_LE | \
+				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER_POWER7	(COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
+				 PPC_FEATURE_TRUE_LE | \
+				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER7	(PPC_FEATURE2_DSCR)
+#define COMMON_USER_POWER8	(COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\
+				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
+				 PPC_FEATURE_TRUE_LE | \
+				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER8	(PPC_FEATURE2_ARCH_2_07 | \
+				 PPC_FEATURE2_HTM_COMP | \
+				 PPC_FEATURE2_HTM_NOSC_COMP | \
+				 PPC_FEATURE2_DSCR | \
+				 PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \
+				 PPC_FEATURE2_VEC_CRYPTO)
+#define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
+				 PPC_FEATURE_TRUE_LE | \
+				 PPC_FEATURE_HAS_ALTIVEC_COMP)
+#define COMMON_USER_POWER9	COMMON_USER_POWER8
+#define COMMON_USER2_POWER9	(COMMON_USER2_POWER8 | \
+				 PPC_FEATURE2_ARCH_3_00 | \
+				 PPC_FEATURE2_HAS_IEEE128 | \
+				 PPC_FEATURE2_DARN )
+
+#ifdef CONFIG_PPC_BOOK3E_64
+#define COMMON_USER_BOOKE	(COMMON_USER_PPC64 | PPC_FEATURE_BOOKE)
+#else
+#define COMMON_USER_BOOKE	(PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \
+				 PPC_FEATURE_BOOKE)
+#endif
+
+static struct cpu_spec __initdata cpu_specs[] = {
+#ifdef CONFIG_PPC_BOOK3S_64
+	{	/* PPC970 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00390000,
+		.cpu_name		= "PPC970",
+		.cpu_features		= CPU_FTRS_PPC970,
+		.cpu_user_features	= COMMON_USER_POWER4 |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTRS_PPC970,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 8,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_ppc970,
+		.cpu_restore		= __restore_cpu_ppc970,
+		.oprofile_cpu_type	= "ppc64/970",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "ppc970",
+	},
+	{	/* PPC970FX */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x003c0000,
+		.cpu_name		= "PPC970FX",
+		.cpu_features		= CPU_FTRS_PPC970,
+		.cpu_user_features	= COMMON_USER_POWER4 |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTRS_PPC970,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 8,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_ppc970,
+		.cpu_restore		= __restore_cpu_ppc970,
+		.oprofile_cpu_type	= "ppc64/970",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "ppc970",
+	},
+	{	/* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x00440100,
+		.cpu_name		= "PPC970MP",
+		.cpu_features		= CPU_FTRS_PPC970,
+		.cpu_user_features	= COMMON_USER_POWER4 |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTRS_PPC970,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 8,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_ppc970,
+		.cpu_restore		= __restore_cpu_ppc970,
+		.oprofile_cpu_type	= "ppc64/970MP",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "ppc970",
+	},
+	{	/* PPC970MP */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00440000,
+		.cpu_name		= "PPC970MP",
+		.cpu_features		= CPU_FTRS_PPC970,
+		.cpu_user_features	= COMMON_USER_POWER4 |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTRS_PPC970,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 8,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_ppc970MP,
+		.cpu_restore		= __restore_cpu_ppc970,
+		.oprofile_cpu_type	= "ppc64/970MP",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "ppc970",
+	},
+	{	/* PPC970GX */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00450000,
+		.cpu_name		= "PPC970GX",
+		.cpu_features		= CPU_FTRS_PPC970,
+		.cpu_user_features	= COMMON_USER_POWER4 |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.mmu_features		= MMU_FTRS_PPC970,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 8,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_ppc970,
+		.oprofile_cpu_type	= "ppc64/970",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "ppc970",
+	},
+	{	/* Power5 GR */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x003a0000,
+		.cpu_name		= "POWER5 (gr)",
+		.cpu_features		= CPU_FTRS_POWER5,
+		.cpu_user_features	= COMMON_USER_POWER5,
+		.mmu_features		= MMU_FTRS_POWER5,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power5",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		/* SIHV / SIPR bits are implemented on POWER4+ (GQ)
+		 * and above but only works on POWER5 and above
+		 */
+		.oprofile_mmcra_sihv	= MMCRA_SIHV,
+		.oprofile_mmcra_sipr	= MMCRA_SIPR,
+		.platform		= "power5",
+	},
+	{	/* Power5++ */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x003b0300,
+		.cpu_name		= "POWER5+ (gs)",
+		.cpu_features		= CPU_FTRS_POWER5,
+		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTRS_POWER5,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.oprofile_cpu_type	= "ppc64/power5++",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.oprofile_mmcra_sihv	= MMCRA_SIHV,
+		.oprofile_mmcra_sipr	= MMCRA_SIPR,
+		.platform		= "power5+",
+	},
+	{	/* Power5 GS */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x003b0000,
+		.cpu_name		= "POWER5+ (gs)",
+		.cpu_features		= CPU_FTRS_POWER5,
+		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTRS_POWER5,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power5+",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.oprofile_mmcra_sihv	= MMCRA_SIHV,
+		.oprofile_mmcra_sipr	= MMCRA_SIPR,
+		.platform		= "power5+",
+	},
+	{	/* POWER6 in P5+ mode; 2.04-compliant processor */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000001,
+		.cpu_name		= "POWER5+",
+		.cpu_features		= CPU_FTRS_POWER5,
+		.cpu_user_features	= COMMON_USER_POWER5_PLUS,
+		.mmu_features		= MMU_FTRS_POWER5,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "power5+",
+	},
+	{	/* Power6 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x003e0000,
+		.cpu_name		= "POWER6 (raw)",
+		.cpu_features		= CPU_FTRS_POWER6,
+		.cpu_user_features	= COMMON_USER_POWER6 |
+			PPC_FEATURE_POWER6_EXT,
+		.mmu_features		= MMU_FTRS_POWER6,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power6",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.oprofile_mmcra_sihv	= POWER6_MMCRA_SIHV,
+		.oprofile_mmcra_sipr	= POWER6_MMCRA_SIPR,
+		.oprofile_mmcra_clear	= POWER6_MMCRA_THRM |
+			POWER6_MMCRA_OTHER,
+		.platform		= "power6x",
+	},
+	{	/* 2.05-compliant processor, i.e. Power6 "architected" mode */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000002,
+		.cpu_name		= "POWER6 (architected)",
+		.cpu_features		= CPU_FTRS_POWER6,
+		.cpu_user_features	= COMMON_USER_POWER6,
+		.mmu_features		= MMU_FTRS_POWER6,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.platform		= "power6",
+	},
+	{	/* 2.06-compliant processor, i.e. Power7 "architected" mode */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000003,
+		.cpu_name		= "POWER7 (architected)",
+		.cpu_features		= CPU_FTRS_POWER7,
+		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
+		.mmu_features		= MMU_FTRS_POWER7,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.cpu_setup		= __setup_cpu_power7,
+		.cpu_restore		= __restore_cpu_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
+		.platform		= "power7",
+	},
+	{	/* 2.07-compliant processor, i.e. Power8 "architected" mode */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000004,
+		.cpu_name		= "POWER8 (architected)",
+		.cpu_features		= CPU_FTRS_POWER8,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* 3.00-compliant processor, i.e. Power9 "architected" mode */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x0f000005,
+		.cpu_name		= "POWER9 (architected)",
+		.cpu_features		= CPU_FTRS_POWER9,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.platform		= "power9",
+	},
+	{	/* Power7 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x003f0000,
+		.cpu_name		= "POWER7 (raw)",
+		.cpu_features		= CPU_FTRS_POWER7,
+		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
+		.mmu_features		= MMU_FTRS_POWER7,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power7",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.cpu_setup		= __setup_cpu_power7,
+		.cpu_restore		= __restore_cpu_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
+		.platform		= "power7",
+	},
+	{	/* Power7+ */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004A0000,
+		.cpu_name		= "POWER7+ (raw)",
+		.cpu_features		= CPU_FTRS_POWER7,
+		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
+		.mmu_features		= MMU_FTRS_POWER7,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power7",
+		.oprofile_type		= PPC_OPROFILE_POWER4,
+		.cpu_setup		= __setup_cpu_power7,
+		.cpu_restore		= __restore_cpu_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
+		.platform		= "power7+",
+	},
+	{	/* Power8E */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004b0000,
+		.cpu_name		= "POWER8E (raw)",
+		.cpu_features		= CPU_FTRS_POWER8E,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* Power8NVL */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004c0000,
+		.cpu_name		= "POWER8NVL (raw)",
+		.cpu_features		= CPU_FTRS_POWER8,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* Power8 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004d0000,
+		.cpu_name		= "POWER8 (raw)",
+		.cpu_features		= CPU_FTRS_POWER8,
+		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
+		.mmu_features		= MMU_FTRS_POWER8,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power8",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power8,
+		.cpu_restore		= __restore_cpu_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
+		.platform		= "power8",
+	},
+	{	/* Power9 DD2.0 */
+		.pvr_mask		= 0xffffefff,
+		.pvr_value		= 0x004e0200,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9_DD2_0,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.machine_check_early	= __machine_check_early_realmode_p9,
+		.platform		= "power9",
+	},
+	{	/* Power9 DD 2.1 */
+		.pvr_mask		= 0xffffefff,
+		.pvr_value		= 0x004e0201,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9_DD2_1,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.machine_check_early	= __machine_check_early_realmode_p9,
+		.platform		= "power9",
+	},
+	{	/* Power9 DD2.2 or later */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004e0000,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9_DD2_2,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.machine_check_early	= __machine_check_early_realmode_p9,
+		.platform		= "power9",
+	},
+	{	/* Cell Broadband Engine */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00700000,
+		.cpu_name		= "Cell Broadband Engine",
+		.cpu_features		= CPU_FTRS_CELL,
+		.cpu_user_features	= COMMON_USER_PPC64 |
+			PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP |
+			PPC_FEATURE_SMT,
+		.mmu_features		= MMU_FTRS_CELL,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/cell-be",
+		.oprofile_type		= PPC_OPROFILE_CELL,
+		.platform		= "ppc-cell-be",
+	},
+	{	/* PA Semi PA6T */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00900000,
+		.cpu_name		= "PA6T",
+		.cpu_features		= CPU_FTRS_PA6T,
+		.cpu_user_features	= COMMON_USER_PA6T,
+		.mmu_features		= MMU_FTRS_PA6T,
+		.icache_bsize		= 64,
+		.dcache_bsize		= 64,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_PA6T,
+		.cpu_setup		= __setup_cpu_pa6t,
+		.cpu_restore		= __restore_cpu_pa6t,
+		.oprofile_cpu_type	= "ppc64/pa6t",
+		.oprofile_type		= PPC_OPROFILE_PA6T,
+		.platform		= "pa6t",
+	},
+	{	/* default match */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "POWER5 (compatible)",
+		.cpu_features		= CPU_FTRS_COMPATIBLE,
+		.cpu_user_features	= COMMON_USER_PPC64,
+		.mmu_features		= MMU_FTRS_POWER,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.platform		= "power5",
+	}
+#endif	/* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC32
+#ifdef CONFIG_PPC_BOOK3S_32
+	{	/* 601 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00010000,
+		.cpu_name		= "601",
+		.cpu_features		= CPU_FTRS_PPC601,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_601_INSTR |
+			PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc601",
+	},
+	{	/* 603 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00030000,
+		.cpu_name		= "603",
+		.cpu_features		= CPU_FTRS_603,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+	{	/* 603e */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00060000,
+		.cpu_name		= "603e",
+		.cpu_features		= CPU_FTRS_603,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+	{	/* 603ev */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00070000,
+		.cpu_name		= "603ev",
+		.cpu_features		= CPU_FTRS_603,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= 0,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+	{	/* 604 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00040000,
+		.cpu_name		= "604",
+		.cpu_features		= CPU_FTRS_604,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 2,
+		.cpu_setup		= __setup_cpu_604,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc604",
+	},
+	{	/* 604e */
+		.pvr_mask		= 0xfffff000,
+		.pvr_value		= 0x00090000,
+		.cpu_name		= "604e",
+		.cpu_features		= CPU_FTRS_604,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.cpu_setup		= __setup_cpu_604,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc604",
+	},
+	{	/* 604r */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00090000,
+		.cpu_name		= "604r",
+		.cpu_features		= CPU_FTRS_604,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.cpu_setup		= __setup_cpu_604,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc604",
+	},
+	{	/* 604ev */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x000a0000,
+		.cpu_name		= "604ev",
+		.cpu_features		= CPU_FTRS_604,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.cpu_setup		= __setup_cpu_604,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc604",
+	},
+	{	/* 740/750 (0x4202, don't support TAU ?) */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x00084202,
+		.cpu_name		= "740/750",
+		.cpu_features		= CPU_FTRS_740_NOTAU,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750CX (80100 and 8010x?) */
+		.pvr_mask		= 0xfffffff0,
+		.pvr_value		= 0x00080100,
+		.cpu_name		= "750CX",
+		.cpu_features		= CPU_FTRS_750,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.cpu_setup		= __setup_cpu_750cx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750CX (82201 and 82202) */
+		.pvr_mask		= 0xfffffff0,
+		.pvr_value		= 0x00082200,
+		.cpu_name		= "750CX",
+		.cpu_features		= CPU_FTRS_750,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750cx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750CXe (82214) */
+		.pvr_mask		= 0xfffffff0,
+		.pvr_value		= 0x00082210,
+		.cpu_name		= "750CXe",
+		.cpu_features		= CPU_FTRS_750,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750cx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750CXe "Gekko" (83214) */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x00083214,
+		.cpu_name		= "750CXe",
+		.cpu_features		= CPU_FTRS_750,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750cx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750CL (and "Broadway") */
+		.pvr_mask		= 0xfffff0e0,
+		.pvr_value		= 0x00087000,
+		.cpu_name		= "750CL",
+		.cpu_features		= CPU_FTRS_750CL,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+		.oprofile_cpu_type      = "ppc/750",
+		.oprofile_type		= PPC_OPROFILE_G4,
+	},
+	{	/* 745/755 */
+		.pvr_mask		= 0xfffff000,
+		.pvr_value		= 0x00083000,
+		.cpu_name		= "745/755",
+		.cpu_features		= CPU_FTRS_750,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 750FX rev 1.x */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x70000100,
+		.cpu_name		= "750FX",
+		.cpu_features		= CPU_FTRS_750FX1,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+		.oprofile_cpu_type      = "ppc/750",
+		.oprofile_type		= PPC_OPROFILE_G4,
+	},
+	{	/* 750FX rev 2.0 must disable HID0[DPM] */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x70000200,
+		.cpu_name		= "750FX",
+		.cpu_features		= CPU_FTRS_750FX2,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+		.oprofile_cpu_type      = "ppc/750",
+		.oprofile_type		= PPC_OPROFILE_G4,
+	},
+	{	/* 750FX (All revs except 2.0) */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x70000000,
+		.cpu_name		= "750FX",
+		.cpu_features		= CPU_FTRS_750FX,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750fx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+		.oprofile_cpu_type      = "ppc/750",
+		.oprofile_type		= PPC_OPROFILE_G4,
+	},
+	{	/* 750GX */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x70020000,
+		.cpu_name		= "750GX",
+		.cpu_features		= CPU_FTRS_750GX,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750fx,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+		.oprofile_cpu_type      = "ppc/750",
+		.oprofile_type		= PPC_OPROFILE_G4,
+	},
+	{	/* 740/750 (L2CR bit need fixup for 740) */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00080000,
+		.cpu_name		= "740/750",
+		.cpu_features		= CPU_FTRS_740,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_IBM,
+		.cpu_setup		= __setup_cpu_750,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc750",
+	},
+	{	/* 7400 rev 1.1 ? (no TAU) */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x000c1101,
+		.cpu_name		= "7400 (1.1)",
+		.cpu_features		= CPU_FTRS_7400_NOTAU,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_7400,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7400",
+	},
+	{	/* 7400 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x000c0000,
+		.cpu_name		= "7400",
+		.cpu_features		= CPU_FTRS_7400,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_7400,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7400",
+	},
+	{	/* 7410 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x800c0000,
+		.cpu_name		= "7410",
+		.cpu_features		= CPU_FTRS_7400,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_7410,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7400",
+	},
+	{	/* 7450 2.0 - no doze/nap */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x80000200,
+		.cpu_name		= "7450",
+		.cpu_features		= CPU_FTRS_7450_20,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7450 2.1 */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x80000201,
+		.cpu_name		= "7450",
+		.cpu_features		= CPU_FTRS_7450_21,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7450 2.3 and newer */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80000000,
+		.cpu_name		= "7450",
+		.cpu_features		= CPU_FTRS_7450_23,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7455 rev 1.x */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x80010100,
+		.cpu_name		= "7455",
+		.cpu_features		= CPU_FTRS_7455_1,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7455 rev 2.0 */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x80010200,
+		.cpu_name		= "7455",
+		.cpu_features		= CPU_FTRS_7455_20,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7455 others */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80010000,
+		.cpu_name		= "7455",
+		.cpu_features		= CPU_FTRS_7455,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7447/7457 Rev 1.0 */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x80020100,
+		.cpu_name		= "7447/7457",
+		.cpu_features		= CPU_FTRS_7447_10,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7447/7457 Rev 1.1 */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x80020101,
+		.cpu_name		= "7447/7457",
+		.cpu_features		= CPU_FTRS_7447_10,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7447/7457 Rev 1.2 and later */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80020000,
+		.cpu_name		= "7447/7457",
+		.cpu_features		= CPU_FTRS_7447,
+		.cpu_user_features	= COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7447A */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80030000,
+		.cpu_name		= "7447A",
+		.cpu_features		= CPU_FTRS_7447A,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 7448 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80040000,
+		.cpu_name		= "7448",
+		.cpu_features		= CPU_FTRS_7448,
+		.cpu_user_features	= COMMON_USER |
+			PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE,
+		.mmu_features		= MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_G4,
+		.cpu_setup		= __setup_cpu_745x,
+		.oprofile_cpu_type      = "ppc/7450",
+		.oprofile_type		= PPC_OPROFILE_G4,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc7450",
+	},
+	{	/* 82xx (8240, 8245, 8260 are all 603e cores) */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00810000,
+		.cpu_name		= "82xx",
+		.cpu_features		= CPU_FTRS_82XX,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= 0,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+	{	/* All G2_LE (603e core, plus some) have the same pvr */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00820000,
+		.cpu_name		= "G2_LE",
+		.cpu_features		= CPU_FTRS_G2_LE,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+#ifdef CONFIG_PPC_83xx
+	{	/* e300c1 (a 603e core, plus some) on 83xx */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00830000,
+		.cpu_name		= "e300c1",
+		.cpu_features		= CPU_FTRS_E300,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_83xx,
+		.platform		= "ppc603",
+	},
+	{	/* e300c2 (an e300c1 core, plus some, minus FPU) on 83xx */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00840000,
+		.cpu_name		= "e300c2",
+		.cpu_features		= CPU_FTRS_E300C2,
+		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS |
+			MMU_FTR_NEED_DTLB_SW_LRU,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_83xx,
+		.platform		= "ppc603",
+	},
+	{	/* e300c3 (e300c1, plus one IU, half cache size) on 83xx */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00850000,
+		.cpu_name		= "e300c3",
+		.cpu_features		= CPU_FTRS_E300,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS |
+			MMU_FTR_NEED_DTLB_SW_LRU,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_83xx,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e300",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.platform		= "ppc603",
+	},
+	{	/* e300c4 (e300c1, plus one IU) */
+		.pvr_mask		= 0x7fff0000,
+		.pvr_value		= 0x00860000,
+		.cpu_name		= "e300c4",
+		.cpu_features		= CPU_FTRS_E300,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_USE_HIGH_BATS |
+			MMU_FTR_NEED_DTLB_SW_LRU,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_603,
+		.machine_check		= machine_check_83xx,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e300",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.platform		= "ppc603",
+	},
+#endif
+	{	/* default match, we assume split I/D cache & TB (non-601)... */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "(generic PPC)",
+		.cpu_features		= CPU_FTRS_CLASSIC32,
+		.cpu_user_features	= COMMON_USER,
+		.mmu_features		= MMU_FTR_HPTE_TABLE,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_generic,
+		.platform		= "ppc603",
+	},
+#endif /* CONFIG_PPC_BOOK3S_32 */
+#ifdef CONFIG_PPC_8xx
+	{	/* 8xx */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= PVR_8xx,
+		.cpu_name		= "8xx",
+		/* CPU_FTR_MAYBE_CAN_DOZE is possible,
+		 * if the 8xx code is there.... */
+		.cpu_features		= CPU_FTRS_8XX,
+		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_8xx,
+		.icache_bsize		= 16,
+		.dcache_bsize		= 16,
+		.machine_check		= machine_check_8xx,
+		.platform		= "ppc823",
+	},
+#endif /* CONFIG_PPC_8xx */
+#ifdef CONFIG_40x
+	{	/* 403GC */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x00200200,
+		.cpu_name		= "403GC",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 16,
+		.dcache_bsize		= 16,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc403",
+	},
+	{	/* 403GCX */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x00201400,
+		.cpu_name		= "403GCX",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+		 	PPC_FEATURE_HAS_MMU | PPC_FEATURE_NO_TB,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 16,
+		.dcache_bsize		= 16,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc403",
+	},
+	{	/* 403G ?? */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00200000,
+		.cpu_name		= "403G ??",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 16,
+		.dcache_bsize		= 16,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc403",
+	},
+	{	/* 405GP */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x40110000,
+		.cpu_name		= "405GP",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* STB 03xxx */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x40130000,
+		.cpu_name		= "STB03xxx",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* STB 04xxx */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x41810000,
+		.cpu_name		= "STB04xxx",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* NP405L */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x41610000,
+		.cpu_name		= "NP405L",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* NP4GS3 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x40B10000,
+		.cpu_name		= "NP4GS3",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{   /* NP405H */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x41410000,
+		.cpu_name		= "NP405H",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405GPr */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x50910000,
+		.cpu_name		= "405GPr",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{   /* STBx25xx */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x51510000,
+		.cpu_name		= "STBx25xx",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405LP */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x41F10000,
+		.cpu_name		= "405LP",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* Xilinx Virtex-II Pro  */
+		.pvr_mask		= 0xfffff000,
+		.pvr_value		= 0x20010000,
+		.cpu_name		= "Virtex-II Pro",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* Xilinx Virtex-4 FX */
+		.pvr_mask		= 0xfffff000,
+		.pvr_value		= 0x20011000,
+		.cpu_name		= "Virtex-4 FX",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EP */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x51210000,
+		.cpu_name		= "405EP",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EX Rev. A/B with Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910007,
+		.cpu_name		= "405EX Rev. A/B",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EX Rev. C without Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x1291000d,
+		.cpu_name		= "405EX Rev. C",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EX Rev. C with Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x1291000f,
+		.cpu_name		= "405EX Rev. C",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EX Rev. D without Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910003,
+		.cpu_name		= "405EX Rev. D",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EX Rev. D with Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910005,
+		.cpu_name		= "405EX Rev. D",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EXr Rev. A/B without Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910001,
+		.cpu_name		= "405EXr Rev. A/B",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EXr Rev. C without Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910009,
+		.cpu_name		= "405EXr Rev. C",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EXr Rev. C with Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x1291000b,
+		.cpu_name		= "405EXr Rev. C",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EXr Rev. D without Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910000,
+		.cpu_name		= "405EXr Rev. D",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* 405EXr Rev. D with Security */
+		.pvr_mask		= 0xffff000f,
+		.pvr_value		= 0x12910002,
+		.cpu_name		= "405EXr Rev. D",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{
+		/* 405EZ */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x41510000,
+		.cpu_name		= "405EZ",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* APM8018X */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x7ff11432,
+		.cpu_name		= "APM8018X",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	},
+	{	/* default match */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "(generic 40x PPC)",
+		.cpu_features		= CPU_FTRS_40X,
+		.cpu_user_features	= PPC_FEATURE_32 |
+			PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+		.mmu_features		= MMU_FTR_TYPE_40x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc405",
+	}
+
+#endif /* CONFIG_40x */
+#ifdef CONFIG_44x
+	{
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x40000850,
+		.cpu_name		= "440GR Rev. A",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{ /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x40000858,
+		.cpu_name		= "440EP Rev. A",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440ep,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x400008d3,
+		.cpu_name		= "440GR Rev. B",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{ /* Matches both physical and logical PVR for 440EP (logical pvr = pvr | 0x8) */
+		.pvr_mask		= 0xf0000ff7,
+		.pvr_value		= 0x400008d4,
+		.cpu_name		= "440EP Rev. C",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440ep,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{ /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x400008db,
+		.cpu_name		= "440EP Rev. B",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440ep,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{ /* 440GRX */
+		.pvr_mask		= 0xf0000ffb,
+		.pvr_value		= 0x200008D0,
+		.cpu_name		= "440GRX",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440grx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* Use logical PVR for 440EPx (logical pvr = pvr | 0x8) */
+		.pvr_mask		= 0xf0000ffb,
+		.pvr_value		= 0x200008D8,
+		.cpu_name		= "440EPX",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440epx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{	/* 440GP Rev. B */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x40000440,
+		.cpu_name		= "440GP Rev. B",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440gp",
+	},
+	{	/* 440GP Rev. C */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x40000481,
+		.cpu_name		= "440GP Rev. C",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440gp",
+	},
+	{ /* 440GX Rev. A */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x50000850,
+		.cpu_name		= "440GX Rev. A",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440gx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 440GX Rev. B */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x50000851,
+		.cpu_name		= "440GX Rev. B",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440gx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 440GX Rev. C */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x50000892,
+		.cpu_name		= "440GX Rev. C",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440gx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 440GX Rev. F */
+		.pvr_mask		= 0xf0000fff,
+		.pvr_value		= 0x50000894,
+		.cpu_name		= "440GX Rev. F",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440gx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 440SP Rev. A */
+		.pvr_mask		= 0xfff00fff,
+		.pvr_value		= 0x53200891,
+		.cpu_name		= "440SP Rev. A",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	},
+	{ /* 440SPe Rev. A */
+		.pvr_mask               = 0xfff00fff,
+		.pvr_value              = 0x53400890,
+		.cpu_name               = "440SPe Rev. A",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features      = COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize           = 32,
+		.dcache_bsize           = 32,
+		.cpu_setup		= __setup_cpu_440spe,
+		.machine_check		= machine_check_440A,
+		.platform               = "ppc440",
+	},
+	{ /* 440SPe Rev. B */
+		.pvr_mask		= 0xfff00fff,
+		.pvr_value		= 0x53400891,
+		.cpu_name		= "440SPe Rev. B",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440spe,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 440 in Xilinx Virtex-5 FXT */
+		.pvr_mask		= 0xfffffff0,
+		.pvr_value		= 0x7ff21910,
+		.cpu_name		= "440 in Virtex-5 FXT",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_440x5,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 460EX */
+		.pvr_mask		= 0xffff0006,
+		.pvr_value		= 0x13020002,
+		.cpu_name		= "460EX",
+		.cpu_features		= CPU_FTRS_440x6,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_460ex,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 460EX Rev B */
+		.pvr_mask		= 0xffff0007,
+		.pvr_value		= 0x13020004,
+		.cpu_name		= "460EX Rev. B",
+		.cpu_features		= CPU_FTRS_440x6,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_460ex,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 460GT */
+		.pvr_mask		= 0xffff0006,
+		.pvr_value		= 0x13020000,
+		.cpu_name		= "460GT",
+		.cpu_features		= CPU_FTRS_440x6,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_460gt,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 460GT Rev B */
+		.pvr_mask		= 0xffff0007,
+		.pvr_value		= 0x13020005,
+		.cpu_name		= "460GT Rev. B",
+		.cpu_features		= CPU_FTRS_440x6,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_460gt,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 460SX */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x13541800,
+		.cpu_name		= "460SX",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_460sx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+	{ /* 464 in APM821xx */
+		.pvr_mask		= 0xfffffff0,
+		.pvr_value		= 0x12C41C80,
+		.cpu_name		= "APM821XX",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_apm821xx,
+		.machine_check		= machine_check_440A,
+		.platform		= "ppc440",
+	},
+#ifdef CONFIG_PPC_47x
+	{ /* 476 DD2 core */
+		.pvr_mask		= 0xffffffff,
+		.pvr_value		= 0x11a52080,
+		.cpu_name		= "476",
+		.cpu_features		= CPU_FTRS_47X | CPU_FTR_476_DD2,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_47x |
+			MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 128,
+		.machine_check		= machine_check_47x,
+		.platform		= "ppc470",
+	},
+	{ /* 476fpe */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x7ff50000,
+		.cpu_name		= "476fpe",
+		.cpu_features		= CPU_FTRS_47X | CPU_FTR_476_DD2,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_47x |
+			MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 128,
+		.machine_check		= machine_check_47x,
+		.platform		= "ppc470",
+	},
+	{ /* 476 iss */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00050000,
+		.cpu_name		= "476",
+		.cpu_features		= CPU_FTRS_47X,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_47x |
+			MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 128,
+		.machine_check		= machine_check_47x,
+		.platform		= "ppc470",
+	},
+	{ /* 476 others */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x11a50000,
+		.cpu_name		= "476",
+		.cpu_features		= CPU_FTRS_47X,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_FPU,
+		.mmu_features		= MMU_FTR_TYPE_47x |
+			MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 128,
+		.machine_check		= machine_check_47x,
+		.platform		= "ppc470",
+	},
+#endif /* CONFIG_PPC_47x */
+	{	/* default match */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "(generic 44x PPC)",
+		.cpu_features		= CPU_FTRS_44X,
+		.cpu_user_features	= COMMON_USER_BOOKE,
+		.mmu_features		= MMU_FTR_TYPE_44x,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_4xx,
+		.platform		= "ppc440",
+	}
+#endif /* CONFIG_44x */
+#ifdef CONFIG_E200
+	{	/* e200z5 */
+		.pvr_mask		= 0xfff00000,
+		.pvr_value		= 0x81000000,
+		.cpu_name		= "e200z5",
+		/* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */
+		.cpu_features		= CPU_FTRS_E200,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_EFP_SINGLE |
+			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_e200,
+		.platform		= "ppc5554",
+	},
+	{	/* e200z6 */
+		.pvr_mask		= 0xfff00000,
+		.pvr_value		= 0x81100000,
+		.cpu_name		= "e200z6",
+		/* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */
+		.cpu_features		= CPU_FTRS_E200,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_SPE_COMP |
+			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
+			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_e200,
+		.platform		= "ppc5554",
+	},
+	{	/* default match */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "(generic E200 PPC)",
+		.cpu_features		= CPU_FTRS_E200,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_EFP_SINGLE |
+			PPC_FEATURE_UNIFIED_CACHE,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
+		.dcache_bsize		= 32,
+		.cpu_setup		= __setup_cpu_e200,
+		.machine_check		= machine_check_e200,
+		.platform		= "ppc5554",
+	}
+#endif /* CONFIG_E200 */
+#endif /* CONFIG_PPC32 */
+#ifdef CONFIG_E500
+#ifdef CONFIG_PPC32
+#ifndef CONFIG_PPC_E500MC
+	{	/* e500 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80200000,
+		.cpu_name		= "e500",
+		.cpu_features		= CPU_FTRS_E500,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_SPE_COMP |
+			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e500",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e500v1,
+		.machine_check		= machine_check_e500,
+		.platform		= "ppc8540",
+	},
+	{	/* e500v2 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80210000,
+		.cpu_name		= "e500v2",
+		.cpu_features		= CPU_FTRS_E500_2,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_SPE_COMP |
+			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
+			PPC_FEATURE_HAS_EFP_DOUBLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e500",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e500v2,
+		.machine_check		= machine_check_e500,
+		.platform		= "ppc8548",
+		.cpu_down_flush		= cpu_down_flush_e500v2,
+	},
+#else
+	{	/* e500mc */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80230000,
+		.cpu_name		= "e500mc",
+		.cpu_features		= CPU_FTRS_E500MC,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
+			MMU_FTR_USE_TLBILX,
+		.icache_bsize		= 64,
+		.dcache_bsize		= 64,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e500mc",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e500mc,
+		.machine_check		= machine_check_e500mc,
+		.platform		= "ppce500mc",
+		.cpu_down_flush		= cpu_down_flush_e500mc,
+	},
+#endif /* CONFIG_PPC_E500MC */
+#endif /* CONFIG_PPC32 */
+#ifdef CONFIG_PPC_E500MC
+	{	/* e5500 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80240000,
+		.cpu_name		= "e5500",
+		.cpu_features		= CPU_FTRS_E5500,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
+			MMU_FTR_USE_TLBILX,
+		.icache_bsize		= 64,
+		.dcache_bsize		= 64,
+		.num_pmcs		= 4,
+		.oprofile_cpu_type	= "ppc/e500mc",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e5500,
+#ifndef CONFIG_PPC32
+		.cpu_restore		= __restore_cpu_e5500,
+#endif
+		.machine_check		= machine_check_e500mc,
+		.platform		= "ppce5500",
+		.cpu_down_flush		= cpu_down_flush_e5500,
+	},
+	{	/* e6500 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x80400000,
+		.cpu_name		= "e6500",
+		.cpu_features		= CPU_FTRS_E6500,
+		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU |
+			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
+			MMU_FTR_USE_TLBILX,
+		.icache_bsize		= 64,
+		.dcache_bsize		= 64,
+		.num_pmcs		= 6,
+		.oprofile_cpu_type	= "ppc/e6500",
+		.oprofile_type		= PPC_OPROFILE_FSL_EMB,
+		.cpu_setup		= __setup_cpu_e6500,
+#ifndef CONFIG_PPC32
+		.cpu_restore		= __restore_cpu_e6500,
+#endif
+		.machine_check		= machine_check_e500mc,
+		.platform		= "ppce6500",
+		.cpu_down_flush		= cpu_down_flush_e6500,
+	},
+#endif /* CONFIG_PPC_E500MC */
+#ifdef CONFIG_PPC32
+	{	/* default match */
+		.pvr_mask		= 0x00000000,
+		.pvr_value		= 0x00000000,
+		.cpu_name		= "(generic E500 PPC)",
+		.cpu_features		= CPU_FTRS_E500,
+		.cpu_user_features	= COMMON_USER_BOOKE |
+			PPC_FEATURE_HAS_SPE_COMP |
+			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.mmu_features		= MMU_FTR_TYPE_FSL_E,
+		.icache_bsize		= 32,
+		.dcache_bsize		= 32,
+		.machine_check		= machine_check_e500,
+		.platform		= "powerpc",
+	}
+#endif /* CONFIG_PPC32 */
+#endif /* CONFIG_E500 */
+};
+
+void __init set_cur_cpu_spec(struct cpu_spec *s)
+{
+	struct cpu_spec *t = &the_cpu_spec;
+
+	t = PTRRELOC(t);
+	*t = *s;
+
+	*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+}
+
+static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
+					       struct cpu_spec *s)
+{
+	struct cpu_spec *t = &the_cpu_spec;
+	struct cpu_spec old;
+
+	t = PTRRELOC(t);
+	old = *t;
+
+	/* Copy everything, then do fixups */
+	*t = *s;
+
+	/*
+	 * If we are overriding a previous value derived from the real
+	 * PVR with a new value obtained using a logical PVR value,
+	 * don't modify the performance monitor fields.
+	 */
+	if (old.num_pmcs && !s->num_pmcs) {
+		t->num_pmcs = old.num_pmcs;
+		t->pmc_type = old.pmc_type;
+		t->oprofile_type = old.oprofile_type;
+		t->oprofile_mmcra_sihv = old.oprofile_mmcra_sihv;
+		t->oprofile_mmcra_sipr = old.oprofile_mmcra_sipr;
+		t->oprofile_mmcra_clear = old.oprofile_mmcra_clear;
+
+		/*
+		 * If we have passed through this logic once before and
+		 * have pulled the default case because the real PVR was
+		 * not found inside cpu_specs[], then we are possibly
+		 * running in compatibility mode. In that case, let the
+		 * oprofiler know which set of compatibility counters to
+		 * pull from by making sure the oprofile_cpu_type string
+		 * is set to that of compatibility mode. If the
+		 * oprofile_cpu_type already has a value, then we are
+		 * possibly overriding a real PVR with a logical one,
+		 * and, in that case, keep the current value for
+		 * oprofile_cpu_type. Futhermore, let's ensure that the
+		 * fix for the PMAO bug is enabled on compatibility mode.
+		 */
+		if (old.oprofile_cpu_type != NULL) {
+			t->oprofile_cpu_type = old.oprofile_cpu_type;
+			t->oprofile_type = old.oprofile_type;
+			t->cpu_features |= old.cpu_features & CPU_FTR_PMAO_BUG;
+		}
+	}
+
+	*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
+
+	/*
+	 * Set the base platform string once; assumes
+	 * we're called with real pvr first.
+	 */
+	if (*PTRRELOC(&powerpc_base_platform) == NULL)
+		*PTRRELOC(&powerpc_base_platform) = t->platform;
+
+#if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
+	/* ppc64 and booke expect identify_cpu to also call setup_cpu for
+	 * that processor. I will consolidate that at a later time, for now,
+	 * just use #ifdef. We also don't need to PTRRELOC the function
+	 * pointer on ppc64 and booke as we are running at 0 in real mode
+	 * on ppc64 and reloc_offset is always 0 on booke.
+	 */
+	if (t->cpu_setup) {
+		t->cpu_setup(offset, t);
+	}
+#endif /* CONFIG_PPC64 || CONFIG_BOOKE */
+
+	return t;
+}
+
+struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
+{
+	struct cpu_spec *s = cpu_specs;
+	int i;
+
+	s = PTRRELOC(s);
+
+	for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) {
+		if ((pvr & s->pvr_mask) == s->pvr_value)
+			return setup_cpu_spec(offset, s);
+	}
+
+	BUG();
+
+	return NULL;
+}
+
+/*
+ * Used by cpufeatures to get the name for CPUs with a PVR table.
+ * If they don't hae a PVR table, cpufeatures gets the name from
+ * cpu device-tree node.
+ */
+void __init identify_cpu_name(unsigned int pvr)
+{
+	struct cpu_spec *s = cpu_specs;
+	struct cpu_spec *t = &the_cpu_spec;
+	int i;
+
+	s = PTRRELOC(s);
+	t = PTRRELOC(t);
+
+	for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) {
+		if ((pvr & s->pvr_mask) == s->pvr_value) {
+			t->cpu_name = s->cpu_name;
+			return;
+		}
+	}
+}
+
+
+#ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
+struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS] = {
+			[0 ... NUM_CPU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT
+};
+EXPORT_SYMBOL_GPL(cpu_feature_keys);
+
+void __init cpu_feature_keys_init(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_CPU_FTR_KEYS; i++) {
+		unsigned long f = 1ul << i;
+
+		if (!(cur_cpu_spec->cpu_features & f))
+			static_branch_disable(&cpu_feature_keys[i]);
+	}
+}
+
+struct static_key_true mmu_feature_keys[NUM_MMU_FTR_KEYS] = {
+			[0 ... NUM_MMU_FTR_KEYS - 1] = STATIC_KEY_TRUE_INIT
+};
+EXPORT_SYMBOL_GPL(mmu_feature_keys);
+
+void __init mmu_feature_keys_init(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_MMU_FTR_KEYS; i++) {
+		unsigned long f = 1ul << i;
+
+		if (!(cur_cpu_spec->mmu_features & f))
+			static_branch_disable(&mmu_feature_keys[i]);
+	}
+}
+#endif
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
new file mode 100644
index 000000000..43a3ce230
--- /dev/null
+++ b/arch/powerpc/kernel/crash.c
@@ -0,0 +1,377 @@
+/*
+ * Architecture specific (PPC64) functions for kexec based crash dumps.
+ *
+ * Copyright (C) 2005, IBM Corp.
+ *
+ * Created by: Haren Myneni
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/export.h>
+#include <linux/crash_dump.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/machdep.h>
+#include <asm/kexec.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/setjmp.h>
+#include <asm/debug.h>
+
+/*
+ * The primary CPU waits a while for all secondary CPUs to enter. This is to
+ * avoid sending an IPI if the secondary CPUs are entering
+ * crash_kexec_secondary on their own (eg via a system reset).
+ *
+ * The secondary timeout has to be longer than the primary. Both timeouts are
+ * in milliseconds.
+ */
+#define PRIMARY_TIMEOUT		500
+#define SECONDARY_TIMEOUT	1000
+
+#define IPI_TIMEOUT		10000
+#define REAL_MODE_TIMEOUT	10000
+
+static int time_to_dump;
+/*
+ * crash_wake_offline should be set to 1 by platforms that intend to wake
+ * up offline cpus prior to jumping to a kdump kernel. Currently powernv
+ * sets it to 1, since we want to avoid things from happening when an
+ * offline CPU wakes up due to something like an HMI (malfunction error),
+ * which propagates to all threads.
+ */
+int crash_wake_offline;
+
+#define CRASH_HANDLER_MAX 3
+/* List of shutdown handles */
+static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX];
+static DEFINE_SPINLOCK(crash_handlers_lock);
+
+static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
+static int crash_shutdown_cpu = -1;
+
+static int handle_fault(struct pt_regs *regs)
+{
+	if (crash_shutdown_cpu == smp_processor_id())
+		longjmp(crash_shutdown_buf, 1);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static atomic_t cpus_in_crash;
+void crash_ipi_callback(struct pt_regs *regs)
+{
+	static cpumask_t cpus_state_saved = CPU_MASK_NONE;
+
+	int cpu = smp_processor_id();
+
+	hard_irq_disable();
+	if (!cpumask_test_cpu(cpu, &cpus_state_saved)) {
+		crash_save_cpu(regs, cpu);
+		cpumask_set_cpu(cpu, &cpus_state_saved);
+	}
+
+	atomic_inc(&cpus_in_crash);
+	smp_mb__after_atomic();
+
+	/*
+	 * Starting the kdump boot.
+	 * This barrier is needed to make sure that all CPUs are stopped.
+	 */
+	while (!time_to_dump)
+		cpu_relax();
+
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(1, 1);
+
+#ifdef CONFIG_PPC64
+	kexec_smp_wait();
+#else
+	for (;;);	/* FIXME */
+#endif
+
+	/* NOTREACHED */
+}
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+	unsigned int msecs;
+	unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+	int tries = 0;
+	int (*old_handler)(struct pt_regs *regs);
+
+	printk(KERN_EMERG "Sending IPI to other CPUs\n");
+
+	if (crash_wake_offline)
+		ncpus = num_present_cpus() - 1;
+
+	crash_send_ipi(crash_ipi_callback);
+	smp_wmb();
+
+again:
+	/*
+	 * FIXME: Until we will have the way to stop other CPUs reliably,
+	 * the crash CPU will send an IPI and wait for other CPUs to
+	 * respond.
+	 */
+	msecs = IPI_TIMEOUT;
+	while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0))
+		mdelay(1);
+
+	/* Would it be better to replace the trap vector here? */
+
+	if (atomic_read(&cpus_in_crash) >= ncpus) {
+		printk(KERN_EMERG "IPI complete\n");
+		return;
+	}
+
+	printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n",
+		ncpus - atomic_read(&cpus_in_crash));
+
+	/*
+	 * If we have a panic timeout set then we can't wait indefinitely
+	 * for someone to activate system reset. We also give up on the
+	 * second time through if system reset fail to work.
+	 */
+	if ((panic_timeout > 0) || (tries > 0))
+		return;
+
+	/*
+	 * A system reset will cause all CPUs to take an 0x100 exception.
+	 * The primary CPU returns here via setjmp, and the secondary
+	 * CPUs reexecute the crash_kexec_secondary path.
+	 */
+	old_handler = __debugger;
+	__debugger = handle_fault;
+	crash_shutdown_cpu = smp_processor_id();
+
+	if (setjmp(crash_shutdown_buf) == 0) {
+		printk(KERN_EMERG "Activate system reset (dumprestart) "
+				  "to stop other cpu(s)\n");
+
+		/*
+		 * A system reset will force all CPUs to execute the
+		 * crash code again. We need to reset cpus_in_crash so we
+		 * wait for everyone to do this.
+		 */
+		atomic_set(&cpus_in_crash, 0);
+		smp_mb();
+
+		while (atomic_read(&cpus_in_crash) < ncpus)
+			cpu_relax();
+	}
+
+	crash_shutdown_cpu = -1;
+	__debugger = old_handler;
+
+	tries++;
+	goto again;
+}
+
+/*
+ * This function will be called by secondary cpus.
+ */
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int msecs = SECONDARY_TIMEOUT;
+
+	local_irq_save(flags);
+
+	/* Wait for the primary crash CPU to signal its progress */
+	while (crashing_cpu < 0) {
+		if (--msecs < 0) {
+			/* No response, kdump image may not have been loaded */
+			local_irq_restore(flags);
+			return;
+		}
+
+		mdelay(1);
+	}
+
+	crash_ipi_callback(regs);
+}
+
+#else	/* ! CONFIG_SMP */
+
+static void crash_kexec_prepare_cpus(int cpu)
+{
+	/*
+	 * move the secondaries to us so that we can copy
+	 * the new kernel 0-0x100 safely
+	 *
+	 * do this if kexec in setup.c ?
+	 */
+#ifdef CONFIG_PPC64
+	smp_release_cpus();
+#else
+	/* FIXME */
+#endif
+}
+
+void crash_kexec_secondary(struct pt_regs *regs)
+{
+}
+#endif	/* CONFIG_SMP */
+
+/* wait for all the CPUs to hit real mode but timeout if they don't come in */
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
+static void __maybe_unused crash_kexec_wait_realmode(int cpu)
+{
+	unsigned int msecs;
+	int i;
+
+	msecs = REAL_MODE_TIMEOUT;
+	for (i=0; i < nr_cpu_ids && msecs > 0; i++) {
+		if (i == cpu)
+			continue;
+
+		while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) {
+			barrier();
+			if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
+				break;
+			msecs--;
+			mdelay(1);
+		}
+	}
+	mb();
+}
+#else
+static inline void crash_kexec_wait_realmode(int cpu) {}
+#endif	/* CONFIG_SMP && CONFIG_PPC64 */
+
+/*
+ * Register a function to be called on shutdown.  Only use this if you
+ * can't reset your device in the second kernel.
+ */
+int crash_shutdown_register(crash_shutdown_t handler)
+{
+	unsigned int i, rc;
+
+	spin_lock(&crash_handlers_lock);
+	for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+		if (!crash_shutdown_handles[i]) {
+			/* Insert handle at first empty entry */
+			crash_shutdown_handles[i] = handler;
+			rc = 0;
+			break;
+		}
+
+	if (i == CRASH_HANDLER_MAX) {
+		printk(KERN_ERR "Crash shutdown handles full, "
+		       "not registered.\n");
+		rc = 1;
+	}
+
+	spin_unlock(&crash_handlers_lock);
+	return rc;
+}
+EXPORT_SYMBOL(crash_shutdown_register);
+
+int crash_shutdown_unregister(crash_shutdown_t handler)
+{
+	unsigned int i, rc;
+
+	spin_lock(&crash_handlers_lock);
+	for (i = 0 ; i < CRASH_HANDLER_MAX; i++)
+		if (crash_shutdown_handles[i] == handler)
+			break;
+
+	if (i == CRASH_HANDLER_MAX) {
+		printk(KERN_ERR "Crash shutdown handle not found\n");
+		rc = 1;
+	} else {
+		/* Shift handles down */
+		for (; i < (CRASH_HANDLER_MAX - 1); i++)
+			crash_shutdown_handles[i] =
+				crash_shutdown_handles[i+1];
+		/*
+		 * Reset last entry to NULL now that it has been shifted down,
+		 * this will allow new handles to be added here.
+		 */
+		crash_shutdown_handles[i] = NULL;
+		rc = 0;
+	}
+
+	spin_unlock(&crash_handlers_lock);
+	return rc;
+}
+EXPORT_SYMBOL(crash_shutdown_unregister);
+
+void default_machine_crash_shutdown(struct pt_regs *regs)
+{
+	unsigned int i;
+	int (*old_handler)(struct pt_regs *regs);
+
+	/*
+	 * This function is only called after the system
+	 * has panicked or is otherwise in a critical state.
+	 * The minimum amount of code to allow a kexec'd kernel
+	 * to run successfully needs to happen here.
+	 *
+	 * In practice this means stopping other cpus in
+	 * an SMP system.
+	 * The kernel is broken so disable interrupts.
+	 */
+	hard_irq_disable();
+
+	/*
+	 * Make a note of crashing cpu. Will be used in machine_kexec
+	 * such that another IPI will not be sent.
+	 */
+	crashing_cpu = smp_processor_id();
+
+	/*
+	 * If we came in via system reset, wait a while for the secondary
+	 * CPUs to enter.
+	 */
+	if (TRAP(regs) == 0x100)
+		mdelay(PRIMARY_TIMEOUT);
+
+	crash_kexec_prepare_cpus(crashing_cpu);
+
+	crash_save_cpu(regs, crashing_cpu);
+
+	time_to_dump = 1;
+
+	crash_kexec_wait_realmode(crashing_cpu);
+
+	machine_kexec_mask_interrupts();
+
+	/*
+	 * Call registered shutdown routines safely.  Swap out
+	 * __debugger_fault_handler, and replace on exit.
+	 */
+	old_handler = __debugger_fault_handler;
+	__debugger_fault_handler = handle_fault;
+	crash_shutdown_cpu = smp_processor_id();
+	for (i = 0; i < CRASH_HANDLER_MAX && crash_shutdown_handles[i]; i++) {
+		if (setjmp(crash_shutdown_buf) == 0) {
+			/*
+			 * Insert syncs and delay to ensure
+			 * instructions in the dangerous region don't
+			 * leak away from this protected region.
+			 */
+			asm volatile("sync; isync");
+			/* dangerous region */
+			crash_shutdown_handles[i]();
+			asm volatile("sync; isync");
+		}
+	}
+	crash_shutdown_cpu = -1;
+	__debugger_fault_handler = old_handler;
+
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(1, 0);
+}
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
new file mode 100644
index 000000000..d10ad258d
--- /dev/null
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -0,0 +1,148 @@
+/*
+ * Routines for doing kexec-based kdump.
+ *
+ * Copyright (C) 2005, IBM Corp.
+ *
+ * Created by: Michael Ellerman
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#undef DEBUG
+
+#include <linux/crash_dump.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+#include <asm/code-patching.h>
+#include <asm/kdump.h>
+#include <asm/prom.h>
+#include <asm/firmware.h>
+#include <linux/uaccess.h>
+#include <asm/rtas.h>
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+void __init reserve_kdump_trampoline(void)
+{
+	memblock_reserve(0, KDUMP_RESERVE_LIMIT);
+}
+
+static void __init create_trampoline(unsigned long addr)
+{
+	unsigned int *p = (unsigned int *)addr;
+
+	/* The maximum range of a single instruction branch, is the current
+	 * instruction's address + (32 MB - 4) bytes. For the trampoline we
+	 * need to branch to current address + 32 MB. So we insert a nop at
+	 * the trampoline address, then the next instruction (+ 4 bytes)
+	 * does a branch to (32 MB - 4). The net effect is that when we
+	 * branch to "addr" we jump to ("addr" + 32 MB). Although it requires
+	 * two instructions it doesn't require any registers.
+	 */
+	patch_instruction(p, PPC_INST_NOP);
+	patch_branch(++p, addr + PHYSICAL_START, 0);
+}
+
+void __init setup_kdump_trampoline(void)
+{
+	unsigned long i;
+
+	DBG(" -> setup_kdump_trampoline()\n");
+
+	for (i = KDUMP_TRAMPOLINE_START; i < KDUMP_TRAMPOLINE_END; i += 8) {
+		create_trampoline(i);
+	}
+
+#ifdef CONFIG_PPC_PSERIES
+	create_trampoline(__pa(system_reset_fwnmi) - PHYSICAL_START);
+	create_trampoline(__pa(machine_check_fwnmi) - PHYSICAL_START);
+#endif /* CONFIG_PPC_PSERIES */
+
+	DBG(" <- setup_kdump_trampoline()\n");
+}
+#endif /* CONFIG_NONSTATIC_KERNEL */
+
+static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize,
+                               unsigned long offset, int userbuf)
+{
+	if (userbuf) {
+		if (copy_to_user((char __user *)buf, (vaddr + offset), csize))
+			return -EFAULT;
+	} else
+		memcpy(buf, (vaddr + offset), csize);
+
+	return csize;
+}
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *      space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *      otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+			size_t csize, unsigned long offset, int userbuf)
+{
+	void  *vaddr;
+	phys_addr_t paddr;
+
+	if (!csize)
+		return 0;
+
+	csize = min_t(size_t, csize, PAGE_SIZE);
+	paddr = pfn << PAGE_SHIFT;
+
+	if (memblock_is_region_memory(paddr, csize)) {
+		vaddr = __va(paddr);
+		csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
+	} else {
+		vaddr = __ioremap(paddr, PAGE_SIZE, 0);
+		csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
+		iounmap(vaddr);
+	}
+
+	return csize;
+}
+
+#ifdef CONFIG_PPC_RTAS
+/*
+ * The crashkernel region will almost always overlap the RTAS region, so
+ * we have to be careful when shrinking the crashkernel region.
+ */
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+	const __be32 *basep, *sizep;
+	unsigned int rtas_start = 0, rtas_end = 0;
+
+	basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
+	sizep = of_get_property(rtas.dev, "rtas-size", NULL);
+
+	if (basep && sizep) {
+		rtas_start = be32_to_cpup(basep);
+		rtas_end = rtas_start + be32_to_cpup(sizep);
+	}
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		/* Does this page overlap with the RTAS region? */
+		if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start))
+			continue;
+
+		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+	}
+}
+#endif
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
new file mode 100644
index 000000000..582814455
--- /dev/null
+++ b/arch/powerpc/kernel/dbell.c
@@ -0,0 +1,103 @@
+/*
+ * Author: Kumar Gala <galak@kernel.crashing.org>
+ *
+ * Copyright 2009 Freescale Semiconductor Inc.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/hardirq.h>
+
+#include <asm/dbell.h>
+#include <asm/irq_regs.h>
+#include <asm/kvm_ppc.h>
+
+#ifdef CONFIG_SMP
+
+/*
+ * Doorbells must only be used if CPU_FTR_DBELL is available.
+ * msgsnd is used in HV, and msgsndp is used in !HV.
+ *
+ * These should be used by platform code that is aware of restrictions.
+ * Other arch code should use ->cause_ipi.
+ *
+ * doorbell_global_ipi() sends a dbell to any target CPU.
+ * Must be used only by architectures that address msgsnd target
+ * by PIR/get_hard_smp_processor_id.
+ */
+void doorbell_global_ipi(int cpu)
+{
+	u32 tag = get_hard_smp_processor_id(cpu);
+
+	kvmppc_set_host_ipi(cpu);
+	/* Order previous accesses vs. msgsnd, which is treated as a store */
+	ppc_msgsnd_sync();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
+}
+
+/*
+ * doorbell_core_ipi() sends a dbell to a target CPU in the same core.
+ * Must be used only by architectures that address msgsnd target
+ * by TIR/cpu_thread_in_core.
+ */
+void doorbell_core_ipi(int cpu)
+{
+	u32 tag = cpu_thread_in_core(cpu);
+
+	kvmppc_set_host_ipi(cpu);
+	/* Order previous accesses vs. msgsnd, which is treated as a store */
+	ppc_msgsnd_sync();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
+}
+
+/*
+ * Attempt to cause a core doorbell if destination is on the same core.
+ * Returns 1 on success, 0 on failure.
+ */
+int doorbell_try_core_ipi(int cpu)
+{
+	int this_cpu = get_cpu();
+	int ret = 0;
+
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) {
+		doorbell_core_ipi(cpu);
+		ret = 1;
+	}
+
+	put_cpu();
+
+	return ret;
+}
+
+void doorbell_exception(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	irq_enter();
+
+	ppc_msgsync();
+
+	may_hard_irq_enable();
+
+	kvmppc_clear_host_ipi(smp_processor_id());
+	__this_cpu_inc(irq_stat.doorbell_irqs);
+
+	smp_ipi_demux_relaxed(); /* already performed the barrier */
+
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+#else /* CONFIG_SMP */
+void doorbell_exception(struct pt_regs *regs)
+{
+	printk(KERN_WARNING "Received doorbell on non-smp system\n");
+}
+#endif /* CONFIG_SMP */
+
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
new file mode 100644
index 000000000..eed3543ae
--- /dev/null
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * Provide default implementations of the DMA mapping callbacks for
+ * busses using the iommu infrastructure
+ */
+
+#include <linux/export.h>
+#include <asm/iommu.h>
+
+/*
+ * Generic iommu implementation
+ */
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
+				      dma_addr_t *dma_handle, gfp_t flag,
+				      unsigned long attrs)
+{
+	return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
+				    dma_handle, dev->coherent_dma_mask, flag,
+				    dev_to_node(dev));
+}
+
+static void dma_iommu_free_coherent(struct device *dev, size_t size,
+				    void *vaddr, dma_addr_t dma_handle,
+				    unsigned long attrs)
+{
+	iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
+}
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be
+ * contiguous real kernel storage (not vmalloc).  The address passed here
+ * comprises a page address and offset into that page. The dma_addr_t
+ * returned will point to the same byte within the page as was passed in.
+ */
+static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
+				     unsigned long offset, size_t size,
+				     enum dma_data_direction direction,
+				     unsigned long attrs)
+{
+	return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
+			      size, device_to_mask(dev), direction, attrs);
+}
+
+
+static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
+				 size_t size, enum dma_data_direction direction,
+				 unsigned long attrs)
+{
+	iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
+			 attrs);
+}
+
+
+static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+			    int nelems, enum dma_data_direction direction,
+			    unsigned long attrs)
+{
+	return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems,
+				device_to_mask(dev), direction, attrs);
+}
+
+static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction,
+		unsigned long attrs)
+{
+	ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
+			   direction, attrs);
+}
+
+/* We support DMA to/from any memory page via the iommu */
+int dma_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	struct iommu_table *tbl = get_iommu_table_base(dev);
+
+	if (!tbl) {
+		dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
+			", table unavailable\n", mask);
+		return 0;
+	}
+
+	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
+		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
+		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
+				mask, tbl->it_offset << tbl->it_page_shift);
+		return 0;
+	} else
+		return 1;
+}
+
+static u64 dma_iommu_get_required_mask(struct device *dev)
+{
+	struct iommu_table *tbl = get_iommu_table_base(dev);
+	u64 mask;
+	if (!tbl)
+		return 0;
+
+	mask = 1ULL << (fls_long(tbl->it_offset + tbl->it_size) +
+			tbl->it_page_shift - 1);
+	mask += mask - 1;
+
+	return mask;
+}
+
+int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == IOMMU_MAPPING_ERROR;
+}
+
+struct dma_map_ops dma_iommu_ops = {
+	.alloc			= dma_iommu_alloc_coherent,
+	.free			= dma_iommu_free_coherent,
+	.mmap			= dma_nommu_mmap_coherent,
+	.map_sg			= dma_iommu_map_sg,
+	.unmap_sg		= dma_iommu_unmap_sg,
+	.dma_supported		= dma_iommu_dma_supported,
+	.map_page		= dma_iommu_map_page,
+	.unmap_page		= dma_iommu_unmap_page,
+	.get_required_mask	= dma_iommu_get_required_mask,
+	.mapping_error		= dma_iommu_mapping_error,
+};
+EXPORT_SYMBOL(dma_iommu_ops);
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
new file mode 100644
index 000000000..88f3963ca
--- /dev/null
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -0,0 +1,128 @@
+/*
+ * Contains routines needed to support swiotlb for ppc.
+ *
+ * Copyright (C) 2009-2010 Freescale Semiconductor, Inc.
+ * Author: Becky Bruce
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/memblock.h>
+#include <linux/pfn.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pci.h>
+
+#include <asm/machdep.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+
+unsigned int ppc_swiotlb_enable;
+
+static u64 swiotlb_powerpc_get_required(struct device *dev)
+{
+	u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr;
+
+	end = memblock_end_of_DRAM();
+	if (max_direct_dma_addr && end > max_direct_dma_addr)
+		end = max_direct_dma_addr;
+	end += get_dma_offset(dev);
+
+	mask = 1ULL << (fls64(end) - 1);
+	mask += mask - 1;
+
+	return mask;
+}
+
+/*
+ * At the moment, all platforms that use this code only require
+ * swiotlb to be used if we're operating on HIGHMEM.  Since
+ * we don't ever call anything other than map_sg, unmap_sg,
+ * map_page, and unmap_page on highmem, use normal dma_ops
+ * for everything else.
+ */
+const struct dma_map_ops powerpc_swiotlb_dma_ops = {
+	.alloc = __dma_nommu_alloc_coherent,
+	.free = __dma_nommu_free_coherent,
+	.mmap = dma_nommu_mmap_coherent,
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.dma_supported = swiotlb_dma_supported,
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.mapping_error = swiotlb_dma_mapping_error,
+	.get_required_mask = swiotlb_powerpc_get_required,
+};
+
+void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
+{
+	struct pci_controller *hose;
+	struct dev_archdata *sd;
+
+	hose = pci_bus_to_host(pdev->bus);
+	sd = &pdev->dev.archdata;
+	sd->max_direct_dma_addr =
+		hose->dma_window_base_cur + hose->dma_window_size;
+}
+
+static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
+				  unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct dev_archdata *sd;
+
+	/* We are only intereted in device addition */
+	if (action != BUS_NOTIFY_ADD_DEVICE)
+		return 0;
+
+	sd = &dev->archdata;
+	sd->max_direct_dma_addr = 0;
+
+	/* May need to bounce if the device can't address all of DRAM */
+	if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM())
+		set_dma_ops(dev, &powerpc_swiotlb_dma_ops);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ppc_swiotlb_plat_bus_notifier = {
+	.notifier_call = ppc_swiotlb_bus_notify,
+	.priority = 0,
+};
+
+int __init swiotlb_setup_bus_notifier(void)
+{
+	bus_register_notifier(&platform_bus_type,
+			      &ppc_swiotlb_plat_bus_notifier);
+	return 0;
+}
+
+void __init swiotlb_detect_4g(void)
+{
+	if ((memblock_end_of_DRAM() - 1) > 0xffffffff) {
+		ppc_swiotlb_enable = 1;
+#ifdef CONFIG_ZONE_DMA32
+		limit_zone_pfn(ZONE_DMA32, (1ULL << 32) >> PAGE_SHIFT);
+#endif
+	}
+}
+
+static int __init check_swiotlb_enabled(void)
+{
+	if (ppc_swiotlb_enable)
+		swiotlb_print_info();
+	else
+		swiotlb_exit();
+
+	return 0;
+}
+subsys_initcall(check_swiotlb_enabled);
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
new file mode 100644
index 000000000..dbfc7056d
--- /dev/null
+++ b/arch/powerpc/kernel/dma.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * Provide default implementations of the DMA mapping callbacks for
+ * directly mapped busses.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-debug.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <asm/vio.h>
+#include <asm/bug.h>
+#include <asm/machdep.h>
+#include <asm/swiotlb.h>
+#include <asm/iommu.h>
+
+/*
+ * Generic direct DMA implementation
+ *
+ * This implementation supports a per-device offset that can be applied if
+ * the address at which memory is visible to devices is not 0. Platform code
+ * can set archdata.dma_data to an unsigned long holding the offset. By
+ * default the offset is PCI_DRAM_OFFSET.
+ */
+
+static u64 __maybe_unused get_pfn_limit(struct device *dev)
+{
+	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
+	struct dev_archdata __maybe_unused *sd = &dev->archdata;
+
+#ifdef CONFIG_SWIOTLB
+	if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops)
+		pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
+#endif
+
+	return pfn;
+}
+
+static int dma_nommu_dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PPC64
+	u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1);
+
+	/* Limit fits in the mask, we are good */
+	if (mask >= limit)
+		return 1;
+
+#ifdef CONFIG_FSL_SOC
+	/* Freescale gets another chance via ZONE_DMA/ZONE_DMA32, however
+	 * that will have to be refined if/when they support iommus
+	 */
+	return 1;
+#endif
+	/* Sorry ... */
+	return 0;
+#else
+	return 1;
+#endif
+}
+
+void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
+				  dma_addr_t *dma_handle, gfp_t flag,
+				  unsigned long attrs)
+{
+	void *ret;
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	ret = __dma_alloc_coherent(dev, size, dma_handle, flag);
+	if (ret == NULL)
+		return NULL;
+	*dma_handle += get_dma_offset(dev);
+	return ret;
+#else
+	struct page *page;
+	int node = dev_to_node(dev);
+#ifdef CONFIG_FSL_SOC
+	u64 pfn = get_pfn_limit(dev);
+	int zone;
+
+	/*
+	 * This code should be OK on other platforms, but we have drivers that
+	 * don't set coherent_dma_mask. As a workaround we just ifdef it. This
+	 * whole routine needs some serious cleanup.
+	 */
+
+	zone = dma_pfn_limit_to_zone(pfn);
+	if (zone < 0) {
+		dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
+			__func__, pfn);
+		return NULL;
+	}
+
+	switch (zone) {
+	case ZONE_DMA:
+		flag |= GFP_DMA;
+		break;
+#ifdef CONFIG_ZONE_DMA32
+	case ZONE_DMA32:
+		flag |= GFP_DMA32;
+		break;
+#endif
+	};
+#endif /* CONFIG_FSL_SOC */
+
+	page = alloc_pages_node(node, flag, get_order(size));
+	if (page == NULL)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+	*dma_handle = __pa(ret) + get_dma_offset(dev);
+
+	return ret;
+#endif
+}
+
+void __dma_nommu_free_coherent(struct device *dev, size_t size,
+				void *vaddr, dma_addr_t dma_handle,
+				unsigned long attrs)
+{
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	__dma_free_coherent(size, vaddr);
+#else
+	free_pages((unsigned long)vaddr, get_order(size));
+#endif
+}
+
+static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
+				       dma_addr_t *dma_handle, gfp_t flag,
+				       unsigned long attrs)
+{
+	struct iommu_table *iommu;
+
+	/* The coherent mask may be smaller than the real mask, check if
+	 * we can really use the direct ops
+	 */
+	if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
+		return __dma_nommu_alloc_coherent(dev, size, dma_handle,
+						   flag, attrs);
+
+	/* Ok we can't ... do we have an iommu ? If not, fail */
+	iommu = get_iommu_table_base(dev);
+	if (!iommu)
+		return NULL;
+
+	/* Try to use the iommu */
+	return iommu_alloc_coherent(dev, iommu, size, dma_handle,
+				    dev->coherent_dma_mask, flag,
+				    dev_to_node(dev));
+}
+
+static void dma_nommu_free_coherent(struct device *dev, size_t size,
+				     void *vaddr, dma_addr_t dma_handle,
+				     unsigned long attrs)
+{
+	struct iommu_table *iommu;
+
+	/* See comments in dma_nommu_alloc_coherent() */
+	if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
+		return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle,
+						  attrs);
+	/* Maybe we used an iommu ... */
+	iommu = get_iommu_table_base(dev);
+
+	/* If we hit that we should have never allocated in the first
+	 * place so how come we are freeing ?
+	 */
+	if (WARN_ON(!iommu))
+		return;
+	iommu_free_coherent(iommu, size, vaddr, dma_handle);
+}
+
+int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
+			     void *cpu_addr, dma_addr_t handle, size_t size,
+			     unsigned long attrs)
+{
+	unsigned long pfn;
+
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
+#else
+	pfn = page_to_pfn(virt_to_page(cpu_addr));
+#endif
+	return remap_pfn_range(vma, vma->vm_start,
+			       pfn + vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
+			     int nents, enum dma_data_direction direction,
+			     unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
+		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+	}
+
+	return nents;
+}
+
+static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
+				int nents, enum dma_data_direction direction,
+				unsigned long attrs)
+{
+}
+
+static u64 dma_nommu_get_required_mask(struct device *dev)
+{
+	u64 end, mask;
+
+	end = memblock_end_of_DRAM() + get_dma_offset(dev);
+
+	mask = 1ULL << (fls64(end) - 1);
+	mask += mask - 1;
+
+	return mask;
+}
+
+static inline dma_addr_t dma_nommu_map_page(struct device *dev,
+					     struct page *page,
+					     unsigned long offset,
+					     size_t size,
+					     enum dma_data_direction dir,
+					     unsigned long attrs)
+{
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
+	return page_to_phys(page) + offset + get_dma_offset(dev);
+}
+
+static inline void dma_nommu_unmap_page(struct device *dev,
+					 dma_addr_t dma_address,
+					 size_t size,
+					 enum dma_data_direction direction,
+					 unsigned long attrs)
+{
+}
+
+#ifdef CONFIG_NOT_COHERENT_CACHE
+static inline void dma_nommu_sync_sg(struct device *dev,
+		struct scatterlist *sgl, int nents,
+		enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+}
+
+static inline void dma_nommu_sync_single(struct device *dev,
+					  dma_addr_t dma_handle, size_t size,
+					  enum dma_data_direction direction)
+{
+	__dma_sync(bus_to_virt(dma_handle), size, direction);
+}
+#endif
+
+const struct dma_map_ops dma_nommu_ops = {
+	.alloc				= dma_nommu_alloc_coherent,
+	.free				= dma_nommu_free_coherent,
+	.mmap				= dma_nommu_mmap_coherent,
+	.map_sg				= dma_nommu_map_sg,
+	.unmap_sg			= dma_nommu_unmap_sg,
+	.dma_supported			= dma_nommu_dma_supported,
+	.map_page			= dma_nommu_map_page,
+	.unmap_page			= dma_nommu_unmap_page,
+	.get_required_mask		= dma_nommu_get_required_mask,
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	.sync_single_for_cpu 		= dma_nommu_sync_single,
+	.sync_single_for_device 	= dma_nommu_sync_single,
+	.sync_sg_for_cpu 		= dma_nommu_sync_sg,
+	.sync_sg_for_device 		= dma_nommu_sync_sg,
+#endif
+};
+EXPORT_SYMBOL(dma_nommu_ops);
+
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+	if (!dma_supported(dev, mask)) {
+		/*
+		 * We need to special case the direct DMA ops which can
+		 * support a fallback for coherent allocations. There
+		 * is no dma_op->set_coherent_mask() so we have to do
+		 * things the hard way:
+		 */
+		if (get_dma_ops(dev) != &dma_nommu_ops ||
+		    get_iommu_table_base(dev) == NULL ||
+		    !dma_iommu_dma_supported(dev, mask))
+			return -EIO;
+	}
+	dev->coherent_dma_mask = mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_coherent_mask);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (ppc_md.dma_set_mask)
+		return ppc_md.dma_set_mask(dev, dma_mask);
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+		if (phb->controller_ops.dma_set_mask)
+			return phb->controller_ops.dma_set_mask(pdev, dma_mask);
+	}
+
+	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+		return -EIO;
+	*dev->dma_mask = dma_mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+u64 __dma_get_required_mask(struct device *dev)
+{
+	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
+
+	if (unlikely(dma_ops == NULL))
+		return 0;
+
+	if (dma_ops->get_required_mask)
+		return dma_ops->get_required_mask(dev);
+
+	return DMA_BIT_MASK(8 * sizeof(dma_addr_t));
+}
+
+u64 dma_get_required_mask(struct device *dev)
+{
+	if (ppc_md.dma_get_required_mask)
+		return ppc_md.dma_get_required_mask(dev);
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+		if (phb->controller_ops.dma_get_required_mask)
+			return phb->controller_ops.dma_get_required_mask(pdev);
+	}
+
+	return __dma_get_required_mask(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+
+static int __init dma_init(void)
+{
+#ifdef CONFIG_IBMVIO
+	dma_debug_add_bus(&vio_bus_type);
+#endif
+
+       return 0;
+}
+fs_initcall(dma_init);
+
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
new file mode 100644
index 000000000..5195a4fc3
--- /dev/null
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -0,0 +1,1081 @@
+/*
+ * Copyright 2017, Nicholas Piggin, IBM Corporation
+ * Licensed under GPLv2.
+ */
+
+#define pr_fmt(fmt) "dt-cpu-ftrs: " fmt
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <linux/libfdt.h>
+#include <linux/memblock.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/threads.h>
+
+#include <asm/cputable.h>
+#include <asm/dt_cpu_ftrs.h>
+#include <asm/mmu.h>
+#include <asm/oprofile_impl.h>
+#include <asm/prom.h>
+#include <asm/setup.h>
+
+
+/* Device-tree visible constants follow */
+#define ISA_V2_07B      2070
+#define ISA_V3_0B       3000
+
+#define USABLE_PR               (1U << 0)
+#define USABLE_OS               (1U << 1)
+#define USABLE_HV               (1U << 2)
+
+#define HV_SUPPORT_HFSCR        (1U << 0)
+#define OS_SUPPORT_FSCR         (1U << 0)
+
+/* For parsing, we define all bits set as "NONE" case */
+#define HV_SUPPORT_NONE		0xffffffffU
+#define OS_SUPPORT_NONE		0xffffffffU
+
+struct dt_cpu_feature {
+	const char *name;
+	uint32_t isa;
+	uint32_t usable_privilege;
+	uint32_t hv_support;
+	uint32_t os_support;
+	uint32_t hfscr_bit_nr;
+	uint32_t fscr_bit_nr;
+	uint32_t hwcap_bit_nr;
+	/* fdt parsing */
+	unsigned long node;
+	int enabled;
+	int disabled;
+};
+
+#define MMU_FTRS_HASH_BASE (MMU_FTRS_POWER8)
+
+#define COMMON_USER_BASE	(PPC_FEATURE_32 | PPC_FEATURE_64 | \
+				 PPC_FEATURE_ARCH_2_06 |\
+				 PPC_FEATURE_ICACHE_SNOOP)
+#define COMMON_USER2_BASE	(PPC_FEATURE2_ARCH_2_07 | \
+				 PPC_FEATURE2_ISEL)
+/*
+ * Set up the base CPU
+ */
+
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p9(struct pt_regs *regs);
+
+static int hv_mode;
+
+static struct {
+	u64	lpcr;
+	u64	lpcr_clear;
+	u64	hfscr;
+	u64	fscr;
+} system_registers;
+
+static void (*init_pmu_registers)(void);
+
+static void __restore_cpu_cpufeatures(void)
+{
+	u64 lpcr;
+
+	/*
+	 * LPCR is restored by the power on engine already. It can be changed
+	 * after early init e.g., by radix enable, and we have no unified API
+	 * for saving and restoring such SPRs.
+	 *
+	 * This ->restore hook should really be removed from idle and register
+	 * restore moved directly into the idle restore code, because this code
+	 * doesn't know how idle is implemented or what it needs restored here.
+	 *
+	 * The best we can do to accommodate secondary boot and idle restore
+	 * for now is "or" LPCR with existing.
+	 */
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |= system_registers.lpcr;
+	lpcr &= ~system_registers.lpcr_clear;
+	mtspr(SPRN_LPCR, lpcr);
+	if (hv_mode) {
+		mtspr(SPRN_LPID, 0);
+		mtspr(SPRN_HFSCR, system_registers.hfscr);
+		mtspr(SPRN_PCR, 0);
+	}
+	mtspr(SPRN_FSCR, system_registers.fscr);
+
+	if (init_pmu_registers)
+		init_pmu_registers();
+}
+
+static char dt_cpu_name[64];
+
+static struct cpu_spec __initdata base_cpu_spec = {
+	.cpu_name		= NULL,
+	.cpu_features		= CPU_FTRS_DT_CPU_BASE,
+	.cpu_user_features	= COMMON_USER_BASE,
+	.cpu_user_features2	= COMMON_USER2_BASE,
+	.mmu_features		= 0,
+	.icache_bsize		= 32, /* minimum block size, fixed by */
+	.dcache_bsize		= 32, /* cache info init.             */
+	.num_pmcs		= 0,
+	.pmc_type		= PPC_PMC_DEFAULT,
+	.oprofile_cpu_type	= NULL,
+	.oprofile_type		= PPC_OPROFILE_INVALID,
+	.cpu_setup		= NULL,
+	.cpu_restore		= __restore_cpu_cpufeatures,
+	.machine_check_early	= NULL,
+	.platform		= NULL,
+};
+
+static void __init cpufeatures_setup_cpu(void)
+{
+	set_cur_cpu_spec(&base_cpu_spec);
+
+	cur_cpu_spec->pvr_mask = -1;
+	cur_cpu_spec->pvr_value = mfspr(SPRN_PVR);
+
+	/* Initialize the base environment -- clear FSCR/HFSCR.  */
+	hv_mode = !!(mfmsr() & MSR_HV);
+	if (hv_mode) {
+		/* CPU_FTR_HVMODE is used early in PACA setup */
+		cur_cpu_spec->cpu_features |= CPU_FTR_HVMODE;
+		mtspr(SPRN_HFSCR, 0);
+	}
+	mtspr(SPRN_FSCR, 0);
+
+	/*
+	 * LPCR does not get cleared, to match behaviour with secondaries
+	 * in __restore_cpu_cpufeatures. Once the idle code is fixed, this
+	 * could clear LPCR too.
+	 */
+}
+
+static int __init feat_try_enable_unknown(struct dt_cpu_feature *f)
+{
+	if (f->hv_support == HV_SUPPORT_NONE) {
+	} else if (f->hv_support & HV_SUPPORT_HFSCR) {
+		u64 hfscr = mfspr(SPRN_HFSCR);
+		hfscr |= 1UL << f->hfscr_bit_nr;
+		mtspr(SPRN_HFSCR, hfscr);
+	} else {
+		/* Does not have a known recipe */
+		return 0;
+	}
+
+	if (f->os_support == OS_SUPPORT_NONE) {
+	} else if (f->os_support & OS_SUPPORT_FSCR) {
+		u64 fscr = mfspr(SPRN_FSCR);
+		fscr |= 1UL << f->fscr_bit_nr;
+		mtspr(SPRN_FSCR, fscr);
+	} else {
+		/* Does not have a known recipe */
+		return 0;
+	}
+
+	if ((f->usable_privilege & USABLE_PR) && (f->hwcap_bit_nr != -1)) {
+		uint32_t word = f->hwcap_bit_nr / 32;
+		uint32_t bit = f->hwcap_bit_nr % 32;
+
+		if (word == 0)
+			cur_cpu_spec->cpu_user_features |= 1U << bit;
+		else if (word == 1)
+			cur_cpu_spec->cpu_user_features2 |= 1U << bit;
+		else
+			pr_err("%s could not advertise to user (no hwcap bits)\n", f->name);
+	}
+
+	return 1;
+}
+
+static int __init feat_enable(struct dt_cpu_feature *f)
+{
+	if (f->hv_support != HV_SUPPORT_NONE) {
+		if (f->hfscr_bit_nr != -1) {
+			u64 hfscr = mfspr(SPRN_HFSCR);
+			hfscr |= 1UL << f->hfscr_bit_nr;
+			mtspr(SPRN_HFSCR, hfscr);
+		}
+	}
+
+	if (f->os_support != OS_SUPPORT_NONE) {
+		if (f->fscr_bit_nr != -1) {
+			u64 fscr = mfspr(SPRN_FSCR);
+			fscr |= 1UL << f->fscr_bit_nr;
+			mtspr(SPRN_FSCR, fscr);
+		}
+	}
+
+	if ((f->usable_privilege & USABLE_PR) && (f->hwcap_bit_nr != -1)) {
+		uint32_t word = f->hwcap_bit_nr / 32;
+		uint32_t bit = f->hwcap_bit_nr % 32;
+
+		if (word == 0)
+			cur_cpu_spec->cpu_user_features |= 1U << bit;
+		else if (word == 1)
+			cur_cpu_spec->cpu_user_features2 |= 1U << bit;
+		else
+			pr_err("CPU feature: %s could not advertise to user (no hwcap bits)\n", f->name);
+	}
+
+	return 1;
+}
+
+static int __init feat_disable(struct dt_cpu_feature *f)
+{
+	return 0;
+}
+
+static int __init feat_enable_hv(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	if (!hv_mode) {
+		pr_err("CPU feature hypervisor present in device tree but HV mode not enabled in the CPU. Ignoring.\n");
+		return 0;
+	}
+
+	mtspr(SPRN_LPID, 0);
+
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr &=  ~LPCR_LPES0; /* HV external interrupts */
+	mtspr(SPRN_LPCR, lpcr);
+
+	cur_cpu_spec->cpu_features |= CPU_FTR_HVMODE;
+
+	return 1;
+}
+
+static int __init feat_enable_le(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_TRUE_LE;
+	return 1;
+}
+
+static int __init feat_enable_smt(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->cpu_features |= CPU_FTR_SMT;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_SMT;
+	return 1;
+}
+
+static int __init feat_enable_idle_nap(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	/* Set PECE wakeup modes for ISA 207 */
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |=  LPCR_PECE0;
+	lpcr |=  LPCR_PECE1;
+	lpcr |=  LPCR_PECE2;
+	mtspr(SPRN_LPCR, lpcr);
+
+	return 1;
+}
+
+static int __init feat_enable_align_dsisr(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->cpu_features &= ~CPU_FTR_NODSISRALIGN;
+
+	return 1;
+}
+
+static int __init feat_enable_idle_stop(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	/* Set PECE wakeup modes for ISAv3.0B */
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |=  LPCR_PECE0;
+	lpcr |=  LPCR_PECE1;
+	lpcr |=  LPCR_PECE2;
+	mtspr(SPRN_LPCR, lpcr);
+
+	return 1;
+}
+
+static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr &= ~LPCR_ISL;
+
+	/* VRMASD */
+	lpcr |= LPCR_VPM0;
+	lpcr &= ~LPCR_VPM1;
+	lpcr |= 0x10UL << LPCR_VRMASD_SH; /* L=1 LP=00 */
+	mtspr(SPRN_LPCR, lpcr);
+
+	cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU;
+
+	return 1;
+}
+
+static int __init feat_enable_mmu_hash_v3(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	system_registers.lpcr_clear |= (LPCR_ISL | LPCR_UPRT | LPCR_HR);
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr &= ~(LPCR_ISL | LPCR_UPRT | LPCR_HR);
+	mtspr(SPRN_LPCR, lpcr);
+
+	cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU;
+
+	return 1;
+}
+
+
+static int __init feat_enable_mmu_radix(struct dt_cpu_feature *f)
+{
+#ifdef CONFIG_PPC_RADIX_MMU
+	cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
+	cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU;
+
+	return 1;
+#endif
+	return 0;
+}
+
+static int __init feat_enable_dscr(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	/*
+	 * Linux relies on FSCR[DSCR] being clear, so that we can take the
+	 * facility unavailable interrupt and track the task's usage of DSCR.
+	 * See facility_unavailable_exception().
+	 * Clear the bit here so that feat_enable() doesn't set it.
+	 */
+	f->fscr_bit_nr = -1;
+
+	feat_enable(f);
+
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr &= ~LPCR_DPFD;
+	lpcr |=  (4UL << LPCR_DPFD_SH);
+	mtspr(SPRN_LPCR, lpcr);
+
+	return 1;
+}
+
+static void hfscr_pmu_enable(void)
+{
+	u64 hfscr = mfspr(SPRN_HFSCR);
+	hfscr |= PPC_BIT(60);
+	mtspr(SPRN_HFSCR, hfscr);
+}
+
+static void init_pmu_power8(void)
+{
+	if (hv_mode) {
+		mtspr(SPRN_MMCRC, 0);
+		mtspr(SPRN_MMCRH, 0);
+	}
+
+	mtspr(SPRN_MMCRA, 0);
+	mtspr(SPRN_MMCR0, 0);
+	mtspr(SPRN_MMCR1, 0);
+	mtspr(SPRN_MMCR2, 0);
+	mtspr(SPRN_MMCRS, 0);
+}
+
+static int __init feat_enable_mce_power8(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->platform = "power8";
+	cur_cpu_spec->machine_check_early = __machine_check_early_realmode_p8;
+
+	return 1;
+}
+
+static int __init feat_enable_pmu_power8(struct dt_cpu_feature *f)
+{
+	hfscr_pmu_enable();
+
+	init_pmu_power8();
+	init_pmu_registers = init_pmu_power8;
+
+	cur_cpu_spec->cpu_features |= CPU_FTR_MMCRA;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_PSERIES_PERFMON_COMPAT;
+	if (pvr_version_is(PVR_POWER8E))
+		cur_cpu_spec->cpu_features |= CPU_FTR_PMAO_BUG;
+
+	cur_cpu_spec->num_pmcs		= 6;
+	cur_cpu_spec->pmc_type		= PPC_PMC_IBM;
+	cur_cpu_spec->oprofile_cpu_type	= "ppc64/power8";
+
+	return 1;
+}
+
+static void init_pmu_power9(void)
+{
+	if (hv_mode)
+		mtspr(SPRN_MMCRC, 0);
+
+	mtspr(SPRN_MMCRA, 0);
+	mtspr(SPRN_MMCR0, 0);
+	mtspr(SPRN_MMCR1, 0);
+	mtspr(SPRN_MMCR2, 0);
+}
+
+static int __init feat_enable_mce_power9(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->platform = "power9";
+	cur_cpu_spec->machine_check_early = __machine_check_early_realmode_p9;
+
+	return 1;
+}
+
+static int __init feat_enable_pmu_power9(struct dt_cpu_feature *f)
+{
+	hfscr_pmu_enable();
+
+	init_pmu_power9();
+	init_pmu_registers = init_pmu_power9;
+
+	cur_cpu_spec->cpu_features |= CPU_FTR_MMCRA;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_PSERIES_PERFMON_COMPAT;
+
+	cur_cpu_spec->num_pmcs		= 6;
+	cur_cpu_spec->pmc_type		= PPC_PMC_IBM;
+	cur_cpu_spec->oprofile_cpu_type	= "ppc64/power9";
+
+	return 1;
+}
+
+static int __init feat_enable_tm(struct dt_cpu_feature *f)
+{
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	feat_enable(f);
+	cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NOSC;
+	return 1;
+#endif
+	return 0;
+}
+
+static int __init feat_enable_fp(struct dt_cpu_feature *f)
+{
+	feat_enable(f);
+	cur_cpu_spec->cpu_features &= ~CPU_FTR_FPU_UNAVAILABLE;
+
+	return 1;
+}
+
+static int __init feat_enable_vector(struct dt_cpu_feature *f)
+{
+#ifdef CONFIG_ALTIVEC
+	feat_enable(f);
+	cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC;
+	cur_cpu_spec->cpu_features |= CPU_FTR_VMX_COPY;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC;
+
+	return 1;
+#endif
+	return 0;
+}
+
+static int __init feat_enable_vsx(struct dt_cpu_feature *f)
+{
+#ifdef CONFIG_VSX
+	feat_enable(f);
+	cur_cpu_spec->cpu_features |= CPU_FTR_VSX;
+	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_VSX;
+
+	return 1;
+#endif
+	return 0;
+}
+
+static int __init feat_enable_purr(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->cpu_features |= CPU_FTR_PURR | CPU_FTR_SPURR;
+
+	return 1;
+}
+
+static int __init feat_enable_ebb(struct dt_cpu_feature *f)
+{
+	/*
+	 * PPC_FEATURE2_EBB is enabled in PMU init code because it has
+	 * historically been related to the PMU facility. This may have
+	 * to be decoupled if EBB becomes more generic. For now, follow
+	 * existing convention.
+	 */
+	f->hwcap_bit_nr = -1;
+	feat_enable(f);
+
+	return 1;
+}
+
+static int __init feat_enable_dbell(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	/* P9 has an HFSCR for privileged state */
+	feat_enable(f);
+
+	cur_cpu_spec->cpu_features |= CPU_FTR_DBELL;
+
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |=  LPCR_PECEDH; /* hyp doorbell wakeup */
+	mtspr(SPRN_LPCR, lpcr);
+
+	return 1;
+}
+
+static int __init feat_enable_hvi(struct dt_cpu_feature *f)
+{
+	u64 lpcr;
+
+	/*
+	 * POWER9 XIVE interrupts including in OPAL XICS compatibility
+	 * are always delivered as hypervisor virtualization interrupts (HVI)
+	 * rather than EE.
+	 *
+	 * However LPES0 is not set here, in the chance that an EE does get
+	 * delivered to the host somehow, the EE handler would not expect it
+	 * to be delivered in LPES0 mode (e.g., using SRR[01]). This could
+	 * happen if there is a bug in interrupt controller code, or IC is
+	 * misconfigured in systemsim.
+	 */
+
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |= LPCR_HVICE;	/* enable hvi interrupts */
+	lpcr |= LPCR_HEIC;	/* disable ee interrupts when MSR_HV */
+	lpcr |= LPCR_PECE_HVEE; /* hvi can wake from stop */
+	mtspr(SPRN_LPCR, lpcr);
+
+	return 1;
+}
+
+static int __init feat_enable_large_ci(struct dt_cpu_feature *f)
+{
+	cur_cpu_spec->mmu_features |= MMU_FTR_CI_LARGE_PAGE;
+
+	return 1;
+}
+
+struct dt_cpu_feature_match {
+	const char *name;
+	int (*enable)(struct dt_cpu_feature *f);
+	u64 cpu_ftr_bit_mask;
+};
+
+static struct dt_cpu_feature_match __initdata
+		dt_cpu_feature_match_table[] = {
+	{"hypervisor", feat_enable_hv, 0},
+	{"big-endian", feat_enable, 0},
+	{"little-endian", feat_enable_le, CPU_FTR_REAL_LE},
+	{"smt", feat_enable_smt, 0},
+	{"interrupt-facilities", feat_enable, 0},
+	{"timer-facilities", feat_enable, 0},
+	{"timer-facilities-v3", feat_enable, 0},
+	{"debug-facilities", feat_enable, 0},
+	{"come-from-address-register", feat_enable, CPU_FTR_CFAR},
+	{"branch-tracing", feat_enable, 0},
+	{"floating-point", feat_enable_fp, 0},
+	{"vector", feat_enable_vector, 0},
+	{"vector-scalar", feat_enable_vsx, 0},
+	{"vector-scalar-v3", feat_enable, 0},
+	{"decimal-floating-point", feat_enable, 0},
+	{"decimal-integer", feat_enable, 0},
+	{"quadword-load-store", feat_enable, 0},
+	{"vector-crypto", feat_enable, 0},
+	{"mmu-hash", feat_enable_mmu_hash, 0},
+	{"mmu-radix", feat_enable_mmu_radix, 0},
+	{"mmu-hash-v3", feat_enable_mmu_hash_v3, 0},
+	{"virtual-page-class-key-protection", feat_enable, 0},
+	{"transactional-memory", feat_enable_tm, CPU_FTR_TM},
+	{"transactional-memory-v3", feat_enable_tm, 0},
+	{"tm-suspend-hypervisor-assist", feat_enable, CPU_FTR_P9_TM_HV_ASSIST},
+	{"tm-suspend-xer-so-bug", feat_enable, CPU_FTR_P9_TM_XER_SO_BUG},
+	{"idle-nap", feat_enable_idle_nap, 0},
+	{"alignment-interrupt-dsisr", feat_enable_align_dsisr, 0},
+	{"idle-stop", feat_enable_idle_stop, 0},
+	{"machine-check-power8", feat_enable_mce_power8, 0},
+	{"performance-monitor-power8", feat_enable_pmu_power8, 0},
+	{"data-stream-control-register", feat_enable_dscr, CPU_FTR_DSCR},
+	{"event-based-branch", feat_enable_ebb, 0},
+	{"target-address-register", feat_enable, 0},
+	{"branch-history-rolling-buffer", feat_enable, 0},
+	{"control-register", feat_enable, CPU_FTR_CTRL},
+	{"processor-control-facility", feat_enable_dbell, CPU_FTR_DBELL},
+	{"processor-control-facility-v3", feat_enable_dbell, CPU_FTR_DBELL},
+	{"processor-utilization-of-resources-register", feat_enable_purr, 0},
+	{"no-execute", feat_enable, 0},
+	{"strong-access-ordering", feat_enable, CPU_FTR_SAO},
+	{"cache-inhibited-large-page", feat_enable_large_ci, 0},
+	{"coprocessor-icswx", feat_enable, 0},
+	{"hypervisor-virtualization-interrupt", feat_enable_hvi, 0},
+	{"program-priority-register", feat_enable, CPU_FTR_HAS_PPR},
+	{"wait", feat_enable, 0},
+	{"atomic-memory-operations", feat_enable, 0},
+	{"branch-v3", feat_enable, 0},
+	{"copy-paste", feat_enable, 0},
+	{"decimal-floating-point-v3", feat_enable, 0},
+	{"decimal-integer-v3", feat_enable, 0},
+	{"fixed-point-v3", feat_enable, 0},
+	{"floating-point-v3", feat_enable, 0},
+	{"group-start-register", feat_enable, 0},
+	{"pc-relative-addressing", feat_enable, 0},
+	{"machine-check-power9", feat_enable_mce_power9, 0},
+	{"performance-monitor-power9", feat_enable_pmu_power9, 0},
+	{"event-based-branch-v3", feat_enable, 0},
+	{"random-number-generator", feat_enable, 0},
+	{"system-call-vectored", feat_disable, 0},
+	{"trace-interrupt-v3", feat_enable, 0},
+	{"vector-v3", feat_enable, 0},
+	{"vector-binary128", feat_enable, 0},
+	{"vector-binary16", feat_enable, 0},
+	{"wait-v3", feat_enable, 0},
+};
+
+static bool __initdata using_dt_cpu_ftrs;
+static bool __initdata enable_unknown = true;
+
+static int __init dt_cpu_ftrs_parse(char *str)
+{
+	if (!str)
+		return 0;
+
+	if (!strcmp(str, "off"))
+		using_dt_cpu_ftrs = false;
+	else if (!strcmp(str, "known"))
+		enable_unknown = false;
+	else
+		return 1;
+
+	return 0;
+}
+early_param("dt_cpu_ftrs", dt_cpu_ftrs_parse);
+
+static void __init cpufeatures_setup_start(u32 isa)
+{
+	pr_info("setup for ISA %d\n", isa);
+
+	if (isa >= 3000) {
+		cur_cpu_spec->cpu_features |= CPU_FTR_ARCH_300;
+		cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_ARCH_3_00;
+	}
+}
+
+static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
+{
+	const struct dt_cpu_feature_match *m;
+	bool known = false;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dt_cpu_feature_match_table); i++) {
+		m = &dt_cpu_feature_match_table[i];
+		if (!strcmp(f->name, m->name)) {
+			known = true;
+			if (m->enable(f)) {
+				cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
+				break;
+			}
+
+			pr_info("not enabling: %s (disabled or unsupported by kernel)\n",
+				f->name);
+			return false;
+		}
+	}
+
+	if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) {
+		pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
+			f->name);
+		return false;
+	}
+
+	if (known)
+		pr_debug("enabling: %s\n", f->name);
+	else
+		pr_debug("enabling: %s (unknown)\n", f->name);
+
+	return true;
+}
+
+/*
+ * Handle POWER9 broadcast tlbie invalidation issue using
+ * cpu feature flag.
+ */
+static __init void update_tlbie_feature_flag(unsigned long pvr)
+{
+	if (PVR_VER(pvr) == PVR_POWER9) {
+		/*
+		 * Set the tlbie feature flag for anything below
+		 * Nimbus DD 2.3 and Cumulus DD 1.3
+		 */
+		if ((pvr & 0xe000) == 0) {
+			/* Nimbus */
+			if ((pvr & 0xfff) < 0x203)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		} else if ((pvr & 0xc000) == 0) {
+			/* Cumulus */
+			if ((pvr & 0xfff) < 0x103)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		} else {
+			WARN_ONCE(1, "Unknown PVR");
+			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		}
+
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG;
+	}
+}
+
+static __init void cpufeatures_cpu_quirks(void)
+{
+	unsigned long version = mfspr(SPRN_PVR);
+
+	/*
+	 * Not all quirks can be derived from the cpufeatures device tree.
+	 */
+	if ((version & 0xffffefff) == 0x004e0200)
+		; /* DD2.0 has no feature flag */
+	else if ((version & 0xffffefff) == 0x004e0201)
+		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
+	else if ((version & 0xffffefff) == 0x004e0202) {
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST;
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG;
+		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
+	} else if ((version & 0xffff0000) == 0x004e0000)
+		/* DD2.1 and up have DD2_1 */
+		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
+
+	if ((version & 0xffff0000) == 0x004e0000) {
+		cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR;
+	}
+
+	update_tlbie_feature_flag(version);
+	/*
+	 * PKEY was not in the initial base or feature node
+	 * specification, but it should become optional in the next
+	 * cpu feature version sequence.
+	 */
+	cur_cpu_spec->cpu_features |= CPU_FTR_PKEY;
+}
+
+static void __init cpufeatures_setup_finished(void)
+{
+	cpufeatures_cpu_quirks();
+
+	if (hv_mode && !(cur_cpu_spec->cpu_features & CPU_FTR_HVMODE)) {
+		pr_err("hypervisor not present in device tree but HV mode is enabled in the CPU. Enabling.\n");
+		cur_cpu_spec->cpu_features |= CPU_FTR_HVMODE;
+	}
+
+	/* Make sure powerpc_base_platform is non-NULL */
+	powerpc_base_platform = cur_cpu_spec->platform;
+
+	system_registers.lpcr = mfspr(SPRN_LPCR);
+	system_registers.hfscr = mfspr(SPRN_HFSCR);
+	system_registers.fscr = mfspr(SPRN_FSCR);
+
+	pr_info("final cpu/mmu features = 0x%016lx 0x%08x\n",
+		cur_cpu_spec->cpu_features, cur_cpu_spec->mmu_features);
+}
+
+static int __init disabled_on_cmdline(void)
+{
+	unsigned long root, chosen;
+	const char *p;
+
+	root = of_get_flat_dt_root();
+	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+	if (chosen == -FDT_ERR_NOTFOUND)
+		return false;
+
+	p = of_get_flat_dt_prop(chosen, "bootargs", NULL);
+	if (!p)
+		return false;
+
+	if (strstr(p, "dt_cpu_ftrs=off"))
+		return true;
+
+	return false;
+}
+
+static int __init fdt_find_cpu_features(unsigned long node, const char *uname,
+					int depth, void *data)
+{
+	if (of_flat_dt_is_compatible(node, "ibm,powerpc-cpu-features")
+	    && of_get_flat_dt_prop(node, "isa", NULL))
+		return 1;
+
+	return 0;
+}
+
+bool __init dt_cpu_ftrs_in_use(void)
+{
+	return using_dt_cpu_ftrs;
+}
+
+bool __init dt_cpu_ftrs_init(void *fdt)
+{
+	using_dt_cpu_ftrs = false;
+
+	/* Setup and verify the FDT, if it fails we just bail */
+	if (!early_init_dt_verify(fdt))
+		return false;
+
+	if (!of_scan_flat_dt(fdt_find_cpu_features, NULL))
+		return false;
+
+	if (disabled_on_cmdline())
+		return false;
+
+	cpufeatures_setup_cpu();
+
+	using_dt_cpu_ftrs = true;
+	return true;
+}
+
+static int nr_dt_cpu_features;
+static struct dt_cpu_feature *dt_cpu_features;
+
+static int __init process_cpufeatures_node(unsigned long node,
+					  const char *uname, int i)
+{
+	const __be32 *prop;
+	struct dt_cpu_feature *f;
+	int len;
+
+	f = &dt_cpu_features[i];
+	memset(f, 0, sizeof(struct dt_cpu_feature));
+
+	f->node = node;
+
+	f->name = uname;
+
+	prop = of_get_flat_dt_prop(node, "isa", &len);
+	if (!prop) {
+		pr_warn("%s: missing isa property\n", uname);
+		return 0;
+	}
+	f->isa = be32_to_cpup(prop);
+
+	prop = of_get_flat_dt_prop(node, "usable-privilege", &len);
+	if (!prop) {
+		pr_warn("%s: missing usable-privilege property", uname);
+		return 0;
+	}
+	f->usable_privilege = be32_to_cpup(prop);
+
+	prop = of_get_flat_dt_prop(node, "hv-support", &len);
+	if (prop)
+		f->hv_support = be32_to_cpup(prop);
+	else
+		f->hv_support = HV_SUPPORT_NONE;
+
+	prop = of_get_flat_dt_prop(node, "os-support", &len);
+	if (prop)
+		f->os_support = be32_to_cpup(prop);
+	else
+		f->os_support = OS_SUPPORT_NONE;
+
+	prop = of_get_flat_dt_prop(node, "hfscr-bit-nr", &len);
+	if (prop)
+		f->hfscr_bit_nr = be32_to_cpup(prop);
+	else
+		f->hfscr_bit_nr = -1;
+	prop = of_get_flat_dt_prop(node, "fscr-bit-nr", &len);
+	if (prop)
+		f->fscr_bit_nr = be32_to_cpup(prop);
+	else
+		f->fscr_bit_nr = -1;
+	prop = of_get_flat_dt_prop(node, "hwcap-bit-nr", &len);
+	if (prop)
+		f->hwcap_bit_nr = be32_to_cpup(prop);
+	else
+		f->hwcap_bit_nr = -1;
+
+	if (f->usable_privilege & USABLE_HV) {
+		if (!(mfmsr() & MSR_HV)) {
+			pr_warn("%s: HV feature passed to guest\n", uname);
+			return 0;
+		}
+
+		if (f->hv_support == HV_SUPPORT_NONE && f->hfscr_bit_nr != -1) {
+			pr_warn("%s: unwanted hfscr_bit_nr\n", uname);
+			return 0;
+		}
+
+		if (f->hv_support == HV_SUPPORT_HFSCR) {
+			if (f->hfscr_bit_nr == -1) {
+				pr_warn("%s: missing hfscr_bit_nr\n", uname);
+				return 0;
+			}
+		}
+	} else {
+		if (f->hv_support != HV_SUPPORT_NONE || f->hfscr_bit_nr != -1) {
+			pr_warn("%s: unwanted hv_support/hfscr_bit_nr\n", uname);
+			return 0;
+		}
+	}
+
+	if (f->usable_privilege & USABLE_OS) {
+		if (f->os_support == OS_SUPPORT_NONE && f->fscr_bit_nr != -1) {
+			pr_warn("%s: unwanted fscr_bit_nr\n", uname);
+			return 0;
+		}
+
+		if (f->os_support == OS_SUPPORT_FSCR) {
+			if (f->fscr_bit_nr == -1) {
+				pr_warn("%s: missing fscr_bit_nr\n", uname);
+				return 0;
+			}
+		}
+	} else {
+		if (f->os_support != OS_SUPPORT_NONE || f->fscr_bit_nr != -1) {
+			pr_warn("%s: unwanted os_support/fscr_bit_nr\n", uname);
+			return 0;
+		}
+	}
+
+	if (!(f->usable_privilege & USABLE_PR)) {
+		if (f->hwcap_bit_nr != -1) {
+			pr_warn("%s: unwanted hwcap_bit_nr\n", uname);
+			return 0;
+		}
+	}
+
+	/* Do all the independent features in the first pass */
+	if (!of_get_flat_dt_prop(node, "dependencies", &len)) {
+		if (cpufeatures_process_feature(f))
+			f->enabled = 1;
+		else
+			f->disabled = 1;
+	}
+
+	return 0;
+}
+
+static void __init cpufeatures_deps_enable(struct dt_cpu_feature *f)
+{
+	const __be32 *prop;
+	int len;
+	int nr_deps;
+	int i;
+
+	if (f->enabled || f->disabled)
+		return;
+
+	prop = of_get_flat_dt_prop(f->node, "dependencies", &len);
+	if (!prop) {
+		pr_warn("%s: missing dependencies property", f->name);
+		return;
+	}
+
+	nr_deps = len / sizeof(int);
+
+	for (i = 0; i < nr_deps; i++) {
+		unsigned long phandle = be32_to_cpu(prop[i]);
+		int j;
+
+		for (j = 0; j < nr_dt_cpu_features; j++) {
+			struct dt_cpu_feature *d = &dt_cpu_features[j];
+
+			if (of_get_flat_dt_phandle(d->node) == phandle) {
+				cpufeatures_deps_enable(d);
+				if (d->disabled) {
+					f->disabled = 1;
+					return;
+				}
+			}
+		}
+	}
+
+	if (cpufeatures_process_feature(f))
+		f->enabled = 1;
+	else
+		f->disabled = 1;
+}
+
+static int __init scan_cpufeatures_subnodes(unsigned long node,
+					  const char *uname,
+					  void *data)
+{
+	int *count = data;
+
+	process_cpufeatures_node(node, uname, *count);
+
+	(*count)++;
+
+	return 0;
+}
+
+static int __init count_cpufeatures_subnodes(unsigned long node,
+					  const char *uname,
+					  void *data)
+{
+	int *count = data;
+
+	(*count)++;
+
+	return 0;
+}
+
+static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
+					    *uname, int depth, void *data)
+{
+	const __be32 *prop;
+	int count, i;
+	u32 isa;
+
+	/* We are scanning "ibm,powerpc-cpu-features" nodes only */
+	if (!of_flat_dt_is_compatible(node, "ibm,powerpc-cpu-features"))
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "isa", NULL);
+	if (!prop)
+		/* We checked before, "can't happen" */
+		return 0;
+
+	isa = be32_to_cpup(prop);
+
+	/* Count and allocate space for cpu features */
+	of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes,
+						&nr_dt_cpu_features);
+	dt_cpu_features = __va(
+		memblock_alloc(sizeof(struct dt_cpu_feature)*
+				nr_dt_cpu_features, PAGE_SIZE));
+
+	cpufeatures_setup_start(isa);
+
+	/* Scan nodes into dt_cpu_features and enable those without deps  */
+	count = 0;
+	of_scan_flat_dt_subnodes(node, scan_cpufeatures_subnodes, &count);
+
+	/* Recursive enable remaining features with dependencies */
+	for (i = 0; i < nr_dt_cpu_features; i++) {
+		struct dt_cpu_feature *f = &dt_cpu_features[i];
+
+		cpufeatures_deps_enable(f);
+	}
+
+	prop = of_get_flat_dt_prop(node, "display-name", NULL);
+	if (prop && strlen((char *)prop) != 0) {
+		strlcpy(dt_cpu_name, (char *)prop, sizeof(dt_cpu_name));
+		cur_cpu_spec->cpu_name = dt_cpu_name;
+	}
+
+	cpufeatures_setup_finished();
+
+	memblock_free(__pa(dt_cpu_features),
+			sizeof(struct dt_cpu_feature)*nr_dt_cpu_features);
+
+	return 0;
+}
+
+void __init dt_cpu_ftrs_scan(void)
+{
+	if (!using_dt_cpu_ftrs)
+		return;
+
+	of_scan_flat_dt(dt_cpu_ftrs_scan_callback, NULL);
+}
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 000000000..44bb522fb
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,1844 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/reboot.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/debugfs.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+#include <asm/pte-walk.h>
+
+
+/** Overview:
+ *  EEH, or "Enhanced Error Handling" is a PCI bridge technology for
+ *  dealing with PCI bus errors that can't be dealt with within the
+ *  usual PCI framework, except by check-stopping the CPU.  Systems
+ *  that are designed for high-availability/reliability cannot afford
+ *  to crash due to a "mere" PCI error, thus the need for EEH.
+ *  An EEH-capable bridge operates by converting a detected error
+ *  into a "slot freeze", taking the PCI adapter off-line, making
+ *  the slot behave, from the OS'es point of view, as if the slot
+ *  were "empty": all reads return 0xff's and all writes are silently
+ *  ignored.  EEH slot isolation events can be triggered by parity
+ *  errors on the address or data busses (e.g. during posted writes),
+ *  which in turn might be caused by low voltage on the bus, dust,
+ *  vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ *  Note, however, that one of the leading causes of EEH slot
+ *  freeze events are buggy device drivers, buggy device microcode,
+ *  or buggy device hardware.  This is because any attempt by the
+ *  device to bus-master data to a memory address that is not
+ *  assigned to the device will trigger a slot freeze.   (The idea
+ *  is to prevent devices-gone-wild from corrupting system memory).
+ *  Buggy hardware/drivers will have a miserable time co-existing
+ *  with EEH.
+ *
+ *  Ideally, a PCI device driver, when suspecting that an isolation
+ *  event has occurred (e.g. by reading 0xff's), will then ask EEH
+ *  whether this is the case, and then take appropriate steps to
+ *  reset the PCI slot, the PCI device, and then resume operations.
+ *  However, until that day,  the checking is done here, with the
+ *  eeh_check_failure() routine embedded in the MMIO macros.  If
+ *  the slot is found to be isolated, an "EEH Event" is synthesized
+ *  and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS	2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000)
+
+/*
+ * EEH probe mode support, which is part of the flags,
+ * is to support multiple platforms for EEH. Some platforms
+ * like pSeries do PCI emunation based on device tree.
+ * However, other platforms like powernv probe PCI devices
+ * from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for
+ * particular OF node or PCI device so that the corresponding
+ * PE would be created there.
+ */
+int eeh_subsystem_flags;
+EXPORT_SYMBOL(eeh_subsystem_flags);
+
+/*
+ * EEH allowed maximal frozen times. If one particular PE's
+ * frozen count in last hour exceeds this limit, the PE will
+ * be forced to be offline permanently.
+ */
+int eeh_max_freezes = 5;
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+/* Lock to avoid races due to multiple reports of an error */
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
+EXPORT_SYMBOL_GPL(confirm_error_lock);
+
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 8192
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+	u64 no_device;		/* PCI device not found		*/
+	u64 no_dn;		/* OF node not found		*/
+	u64 no_cfg_addr;	/* Config address not found	*/
+	u64 ignored_check;	/* EEH check skipped		*/
+	u64 total_mmio_ffs;	/* Total EEH checks		*/
+	u64 false_positives;	/* Unnecessary EEH checks	*/
+	u64 slot_resets;	/* PE reset			*/
+};
+
+static struct eeh_stats eeh_stats;
+
+static int __init eeh_setup(char *str)
+{
+	if (!strcmp(str, "off"))
+		eeh_add_flag(EEH_FORCE_DISABLED);
+	else if (!strcmp(str, "early_log"))
+		eeh_add_flag(EEH_EARLY_DUMP_LOG);
+
+	return 1;
+}
+__setup("eeh=", eeh_setup);
+
+/*
+ * This routine captures assorted PCI configuration space data
+ * for the indicated PCI device, and puts them into a buffer
+ * for RTAS error logging.
+ */
+static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	u32 cfg;
+	int cap, i;
+	int n = 0, l = 0;
+	char buffer[128];
+
+	if (!pdn) {
+		pr_warn("EEH: Note: No error log for absent device.\n");
+		return 0;
+	}
+
+	n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
+		       pdn->phb->global_number, pdn->busno,
+		       PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+	pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
+		pdn->phb->global_number, pdn->busno,
+		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+
+	eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+	pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+
+	eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+	pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+
+	/* Gather bridge-specific registers */
+	if (edev->mode & EEH_DEV_BRIDGE) {
+		eeh_ops->read_config(pdn, PCI_SEC_STATUS, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+		pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+
+		eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+		pr_warn("EEH: Bridge control: %04x\n", cfg);
+	}
+
+	/* Dump out the PCI-X command and status regs */
+	cap = edev->pcix_cap;
+	if (cap) {
+		eeh_ops->read_config(pdn, cap, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+		pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
+
+		eeh_ops->read_config(pdn, cap+4, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+		pr_warn("EEH: PCI-X status: %08x\n", cfg);
+	}
+
+	/* If PCI-E capable, dump PCI-E cap 10 */
+	cap = edev->pcie_cap;
+	if (cap) {
+		n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+		pr_warn("EEH: PCI-E capabilities and status follow:\n");
+
+		for (i=0; i<=8; i++) {
+			eeh_ops->read_config(pdn, cap+4*i, 4, &cfg);
+			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+
+			if ((i % 4) == 0) {
+				if (i != 0)
+					pr_warn("%s\n", buffer);
+
+				l = scnprintf(buffer, sizeof(buffer),
+					      "EEH: PCI-E %02x: %08x ",
+					      4*i, cfg);
+			} else {
+				l += scnprintf(buffer+l, sizeof(buffer)-l,
+					       "%08x ", cfg);
+			}
+
+		}
+
+		pr_warn("%s\n", buffer);
+	}
+
+	/* If AER capable, dump it */
+	cap = edev->aer_cap;
+	if (cap) {
+		n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+		pr_warn("EEH: PCI-E AER capability register set follows:\n");
+
+		for (i=0; i<=13; i++) {
+			eeh_ops->read_config(pdn, cap+4*i, 4, &cfg);
+			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+
+			if ((i % 4) == 0) {
+				if (i != 0)
+					pr_warn("%s\n", buffer);
+
+				l = scnprintf(buffer, sizeof(buffer),
+					      "EEH: PCI-E AER %02x: %08x ",
+					      4*i, cfg);
+			} else {
+				l += scnprintf(buffer+l, sizeof(buffer)-l,
+					       "%08x ", cfg);
+			}
+		}
+
+		pr_warn("%s\n", buffer);
+	}
+
+	return n;
+}
+
+static void *eeh_dump_pe_log(struct eeh_pe *pe, void *flag)
+{
+	struct eeh_dev *edev, *tmp;
+	size_t *plen = flag;
+
+	eeh_pe_for_each_dev(pe, edev, tmp)
+		*plen += eeh_dump_dev_log(edev, pci_regs_buf + *plen,
+					  EEH_PCI_REGS_LOG_LEN - *plen);
+
+	return NULL;
+}
+
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
+{
+	size_t loglen = 0;
+
+	/*
+	 * When the PHB is fenced or dead, it's pointless to collect
+	 * the data from PCI config space because it should return
+	 * 0xFF's. For ER, we still retrieve the data from the PCI
+	 * config space.
+	 *
+	 * For pHyp, we have to enable IO for log retrieval. Otherwise,
+	 * 0xFF's is always returned from PCI config space.
+	 *
+	 * When the @severity is EEH_LOG_PERM, the PE is going to be
+	 * removed. Prior to that, the drivers for devices included in
+	 * the PE will be closed. The drivers rely on working IO path
+	 * to bring the devices to quiet state. Otherwise, PCI traffic
+	 * from those devices after they are removed is like to cause
+	 * another unexpected EEH error.
+	 */
+	if (!(pe->type & EEH_PE_PHB)) {
+		if (eeh_has_flag(EEH_ENABLE_IO_FOR_LOG) ||
+		    severity == EEH_LOG_PERM)
+			eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+		/*
+		 * The config space of some PCI devices can't be accessed
+		 * when their PEs are in frozen state. Otherwise, fenced
+		 * PHB might be seen. Those PEs are identified with flag
+		 * EEH_PE_CFG_RESTRICTED, indicating EEH_PE_CFG_BLOCKED
+		 * is set automatically when the PE is put to EEH_PE_ISOLATED.
+		 *
+		 * Restoring BARs possibly triggers PCI config access in
+		 * (OPAL) firmware and then causes fenced PHB. If the
+		 * PCI config is blocked with flag EEH_PE_CFG_BLOCKED, it's
+		 * pointless to restore BARs and dump config space.
+		 */
+		eeh_ops->configure_bridge(pe);
+		if (!(pe->state & EEH_PE_CFG_BLOCKED)) {
+			eeh_pe_restore_bars(pe);
+
+			pci_regs_buf[0] = 0;
+			eeh_pe_traverse(pe, eeh_dump_pe_log, &loglen);
+		}
+	}
+
+	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+	pte_t *ptep;
+	unsigned long pa;
+	int hugepage_shift;
+
+	/*
+	 * We won't find hugepages here(this is iomem). Hence we are not
+	 * worried about _PAGE_SPLITTING/collapse. Also we will not hit
+	 * page table free, because of init_mm.
+	 */
+	ptep = find_init_mm_pte(token, &hugepage_shift);
+	if (!ptep)
+		return token;
+
+	pa = pte_pfn(*ptep);
+
+	/* On radix we can do hugepage mappings for io, so handle that */
+	if (!hugepage_shift)
+		hugepage_shift = PAGE_SHIFT;
+
+	pa <<= PAGE_SHIFT;
+	pa |= token & ((1ul << hugepage_shift) - 1);
+	return pa;
+}
+
+/*
+ * On PowerNV platform, we might already have fenced PHB there.
+ * For that case, it's meaningless to recover frozen PE. Intead,
+ * We have to handle fenced PHB firstly.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+	struct eeh_pe *phb_pe;
+	unsigned long flags;
+	int ret;
+
+	if (!eeh_has_flag(EEH_PROBE_MODE_DEV))
+		return -EPERM;
+
+	/* Find the PHB PE */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	if (!phb_pe) {
+		pr_warn("%s Can't find PE for PHB#%x\n",
+			__func__, pe->phb->global_number);
+		return -EEXIST;
+	}
+
+	/* If the PHB has been in problematic state */
+	eeh_serialize_lock(&flags);
+	if (phb_pe->state & EEH_PE_ISOLATED) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Check PHB state */
+	ret = eeh_ops->get_state(phb_pe, NULL);
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Isolate the PHB and send event */
+	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+
+	pr_err("EEH: PHB#%x failure detected, location: %s\n",
+		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
+	dump_stack();
+	eeh_send_failure_event(phb_pe);
+
+	return 1;
+out:
+	eeh_serialize_unlock(flags);
+	return ret;
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+	int ret;
+	unsigned long flags;
+	struct device_node *dn;
+	struct pci_dev *dev;
+	struct eeh_pe *pe, *parent_pe, *phb_pe;
+	int rc = 0;
+	const char *location = NULL;
+
+	eeh_stats.total_mmio_ffs++;
+
+	if (!eeh_enabled())
+		return 0;
+
+	if (!edev) {
+		eeh_stats.no_dn++;
+		return 0;
+	}
+	dev = eeh_dev_to_pci_dev(edev);
+	pe = eeh_dev_to_pe(edev);
+
+	/* Access to IO BARs might get this far and still not want checking. */
+	if (!pe) {
+		eeh_stats.ignored_check++;
+		pr_debug("EEH: Ignored check for %s\n",
+			eeh_pci_name(dev));
+		return 0;
+	}
+
+	if (!pe->addr && !pe->config_addr) {
+		eeh_stats.no_cfg_addr++;
+		return 0;
+	}
+
+	/*
+	 * On PowerNV platform, we might already have fenced PHB
+	 * there and we need take care of that firstly.
+	 */
+	ret = eeh_phb_check_failure(pe);
+	if (ret > 0)
+		return ret;
+
+	/*
+	 * If the PE isn't owned by us, we shouldn't check the
+	 * state. Instead, let the owner handle it if the PE has
+	 * been frozen.
+	 */
+	if (eeh_pe_passed(pe))
+		return 0;
+
+	/* If we already have a pending isolation event for this
+	 * slot, we know it's bad already, we don't need to check.
+	 * Do this checking under a lock; as multiple PCI devices
+	 * in one slot might report errors simultaneously, and we
+	 * only want one error recovery routine running.
+	 */
+	eeh_serialize_lock(&flags);
+	rc = 1;
+	if (pe->state & EEH_PE_ISOLATED) {
+		pe->check_count++;
+		if (pe->check_count == EEH_MAX_FAILS) {
+			dn = pci_device_to_OF_node(dev);
+			if (dn)
+				location = of_get_property(dn, "ibm,loc-code",
+						NULL);
+			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
+				"location=%s driver=%s pci addr=%s\n",
+				pe->check_count,
+				location ? location : "unknown",
+				eeh_driver_name(dev), eeh_pci_name(dev));
+			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+				eeh_driver_name(dev));
+			dump_stack();
+		}
+		goto dn_unlock;
+	}
+
+	/*
+	 * Now test for an EEH failure.  This is VERY expensive.
+	 * Note that the eeh_config_addr may be a parent device
+	 * in the case of a device behind a bridge, or it may be
+	 * function zero of a multi-function device.
+	 * In any case they must share a common PHB.
+	 */
+	ret = eeh_ops->get_state(pe, NULL);
+
+	/* Note that config-io to empty slots may fail;
+	 * they are empty when they don't have children.
+	 * We will punt with the following conditions: Failure to get
+	 * PE's state, EEH not support and Permanently unavailable
+	 * state, PE is in good state.
+	 */
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) {
+		eeh_stats.false_positives++;
+		pe->false_positives++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	/*
+	 * It should be corner case that the parent PE has been
+	 * put into frozen state as well. We should take care
+	 * that at first.
+	 */
+	parent_pe = pe->parent;
+	while (parent_pe) {
+		/* Hit the ceiling ? */
+		if (parent_pe->type & EEH_PE_PHB)
+			break;
+
+		/* Frozen parent PE ? */
+		ret = eeh_ops->get_state(parent_pe, NULL);
+		if (ret > 0 && !eeh_state_active(ret)) {
+			pe = parent_pe;
+			pr_err("EEH: Failure of PHB#%x-PE#%x will be handled at parent PHB#%x-PE#%x.\n",
+			       pe->phb->global_number, pe->addr,
+			       pe->phb->global_number, parent_pe->addr);
+		}
+
+		/* Next parent level */
+		parent_pe = parent_pe->parent;
+	}
+
+	eeh_stats.slot_resets++;
+
+	/* Avoid repeated reports of this failure, including problems
+	 * with other functions on this device, and functions under
+	 * bridges.
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+
+	/* Most EEH events are due to device driver bugs.  Having
+	 * a stack trace will help the device-driver authors figure
+	 * out what happened.  So print that out.
+	 */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+	       pe->phb->global_number, pe->addr);
+	pr_err("EEH: PE location: %s, PHB location: %s\n",
+	       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+	dump_stack();
+
+	eeh_send_failure_event(pe);
+
+	return 1;
+
+dn_unlock:
+	eeh_serialize_unlock(flags);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
+
+/**
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O address
+ *
+ * Check for an EEH failure at the given I/O address. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event. This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+int eeh_check_failure(const volatile void __iomem *token)
+{
+	unsigned long addr;
+	struct eeh_dev *edev;
+
+	/* Finding the phys addr + pci device; this is pretty quick. */
+	addr = eeh_token_to_phys((unsigned long __force) token);
+	edev = eeh_addr_cache_get_dev(addr);
+	if (!edev) {
+		eeh_stats.no_device++;
+		return 0;
+	}
+
+	return eeh_dev_check_failure(edev);
+}
+EXPORT_SYMBOL(eeh_check_failure);
+
+
+/**
+ * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
+ * @pe: EEH PE
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
+ */
+int eeh_pci_enable(struct eeh_pe *pe, int function)
+{
+	int active_flag, rc;
+
+	/*
+	 * pHyp doesn't allow to enable IO or DMA on unfrozen PE.
+	 * Also, it's pointless to enable them on unfrozen PE. So
+	 * we have to check before enabling IO or DMA.
+	 */
+	switch (function) {
+	case EEH_OPT_THAW_MMIO:
+		active_flag = EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED;
+		break;
+	case EEH_OPT_THAW_DMA:
+		active_flag = EEH_STATE_DMA_ACTIVE;
+		break;
+	case EEH_OPT_DISABLE:
+	case EEH_OPT_ENABLE:
+	case EEH_OPT_FREEZE_PE:
+		active_flag = 0;
+		break;
+	default:
+		pr_warn("%s: Invalid function %d\n",
+			__func__, function);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check if IO or DMA has been enabled before
+	 * enabling them.
+	 */
+	if (active_flag) {
+		rc = eeh_ops->get_state(pe, NULL);
+		if (rc < 0)
+			return rc;
+
+		/* Needn't enable it at all */
+		if (rc == EEH_STATE_NOT_SUPPORT)
+			return 0;
+
+		/* It's already enabled */
+		if (rc & active_flag)
+			return 0;
+	}
+
+
+	/* Issue the request */
+	rc = eeh_ops->set_option(pe, function);
+	if (rc)
+		pr_warn("%s: Unexpected state change %d on "
+			"PHB#%x-PE#%x, err=%d\n",
+			__func__, function, pe->phb->global_number,
+			pe->addr, rc);
+
+	/* Check if the request is finished successfully */
+	if (active_flag) {
+		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+		if (rc < 0)
+			return rc;
+
+		if (rc & active_flag)
+			return 0;
+
+		return -EIO;
+	}
+
+	return rc;
+}
+
+static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
+					    void *userdata)
+{
+	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
+	struct pci_dev *dev = userdata;
+
+	/*
+	 * The caller should have disabled and saved the
+	 * state for the specified device
+	 */
+	if (!pdev || pdev == dev)
+		return NULL;
+
+	/* Ensure we have D0 power state */
+	pci_set_power_state(pdev, PCI_D0);
+
+	/* Save device state */
+	pci_save_state(pdev);
+
+	/*
+	 * Disable device to avoid any DMA traffic and
+	 * interrupt from the device
+	 */
+	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+
+	return NULL;
+}
+
+static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
+	struct pci_dev *dev = userdata;
+
+	if (!pdev)
+		return NULL;
+
+	/* Apply customization from firmware */
+	if (pdn && eeh_ops->restore_config)
+		eeh_ops->restore_config(pdn);
+
+	/* The caller should restore state for the specified device */
+	if (pdev != dev)
+		pci_restore_state(pdev);
+
+	return NULL;
+}
+
+int eeh_restore_vf_config(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 devctl, cmd, cap2, aer_capctl;
+	int old_mps;
+
+	if (edev->pcie_cap) {
+		/* Restore MPS */
+		old_mps = (ffs(pdn->mps) - 8) << 5;
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     2, &devctl);
+		devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+		devctl |= old_mps;
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      2, devctl);
+
+		/* Disable Completion Timeout if possible */
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
+				     4, &cap2);
+		if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) {
+			eeh_ops->read_config(pdn,
+					     edev->pcie_cap + PCI_EXP_DEVCTL2,
+					     4, &cap2);
+			cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS;
+			eeh_ops->write_config(pdn,
+					      edev->pcie_cap + PCI_EXP_DEVCTL2,
+					      4, cap2);
+		}
+	}
+
+	/* Enable SERR and parity checking */
+	eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
+	cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+	eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
+
+	/* Enable report various errors */
+	if (edev->pcie_cap) {
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     2, &devctl);
+		devctl &= ~PCI_EXP_DEVCTL_CERE;
+		devctl |= (PCI_EXP_DEVCTL_NFERE |
+			   PCI_EXP_DEVCTL_FERE |
+			   PCI_EXP_DEVCTL_URRE);
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      2, devctl);
+	}
+
+	/* Enable ECRC generation and check */
+	if (edev->pcie_cap && edev->aer_cap) {
+		eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+				     4, &aer_capctl);
+		aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+		eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+				      4, aer_capctl);
+	}
+
+	return 0;
+}
+
+/**
+ * pcibios_set_pcie_reset_state - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 	0 if success
+ */
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+	struct eeh_pe *pe = eeh_dev_to_pe(edev);
+
+	if (!pe) {
+		pr_err("%s: No PE found on PCI device %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
+
+	switch (state) {
+	case pcie_deassert_reset:
+		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+		eeh_unfreeze_pe(pe, false);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+		eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+		break;
+	case pcie_hot_reset:
+		eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
+		eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
+		eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+		eeh_ops->reset(pe, EEH_RESET_HOT);
+		break;
+	case pcie_warm_reset:
+		eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
+		eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
+		eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+		break;
+	default:
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED);
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
+{
+	struct pci_dev *dev;
+	unsigned int *freset = (unsigned int *)flag;
+
+	dev = eeh_dev_to_pci_dev(edev);
+	if (dev)
+		*freset |= dev->needs_freset;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_reset_full - Complete a full reset process on the indicated PE
+ * @pe: EEH PE
+ *
+ * This function executes a full reset procedure on a PE, including setting
+ * the appropriate flags, performing a fundamental or hot reset, and then
+ * deactivating the reset status.  It is designed to be used within the EEH
+ * subsystem, as opposed to eeh_pe_reset which is exported to drivers and
+ * only performs a single operation at a time.
+ *
+ * This function will attempt to reset a PE three times before failing.
+ */
+int eeh_pe_reset_full(struct eeh_pe *pe)
+{
+	int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
+	int type = EEH_RESET_HOT;
+	unsigned int freset = 0;
+	int i, state, ret;
+
+	/*
+	 * Determine the type of reset to perform - hot or fundamental.
+	 * Hot reset is the default operation, unless any device under the
+	 * PE requires a fundamental reset.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
+
+	if (freset)
+		type = EEH_RESET_FUNDAMENTAL;
+
+	/* Mark the PE as in reset state and block config space accesses */
+	eeh_pe_state_mark(pe, reset_state);
+
+	/* Make three attempts at resetting the bus */
+	for (i = 0; i < 3; i++) {
+		ret = eeh_pe_reset(pe, type);
+		if (ret)
+			break;
+
+		ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
+		if (ret)
+			break;
+
+		/* Wait until the PE is in a functioning state */
+		state = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+		if (eeh_state_active(state))
+			break;
+
+		if (state < 0) {
+			pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x",
+				__func__, pe->phb->global_number, pe->addr);
+			ret = -ENOTRECOVERABLE;
+			break;
+		}
+
+		/* Set error in case this is our last attempt */
+		ret = -EIO;
+		pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n",
+			__func__, state, pe->phb->global_number, pe->addr, (i + 1));
+	}
+
+	eeh_pe_state_clear(pe, reset_state);
+	return ret;
+}
+
+/**
+ * eeh_save_bars - Save device bars
+ * @edev: PCI device associated EEH device
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+void eeh_save_bars(struct eeh_dev *edev)
+{
+	struct pci_dn *pdn;
+	int i;
+
+	pdn = eeh_dev_to_pdn(edev);
+	if (!pdn)
+		return;
+
+	for (i = 0; i < 16; i++)
+		eeh_ops->read_config(pdn, i * 4, 4, &edev->config_space[i]);
+
+	/*
+	 * For PCI bridges including root port, we need enable bus
+	 * master explicitly. Otherwise, it can't fetch IODA table
+	 * entries correctly. So we cache the bit in advance so that
+	 * we can restore it after reset, either PHB range or PE range.
+	 */
+	if (edev->mode & EEH_DEV_BRIDGE)
+		edev->config_space[1] |= PCI_COMMAND_MASTER;
+}
+
+/**
+ * eeh_ops_register - Register platform dependent EEH operations
+ * @ops: platform dependent EEH operations
+ *
+ * Register the platform dependent EEH operation callback
+ * functions. The platform should call this function before
+ * any other EEH operations.
+ */
+int __init eeh_ops_register(struct eeh_ops *ops)
+{
+	if (!ops->name) {
+		pr_warn("%s: Invalid EEH ops name for %p\n",
+			__func__, ops);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && eeh_ops != ops) {
+		pr_warn("%s: EEH ops of platform %s already existing (%s)\n",
+			__func__, eeh_ops->name, ops->name);
+		return -EEXIST;
+	}
+
+	eeh_ops = ops;
+
+	return 0;
+}
+
+/**
+ * eeh_ops_unregister - Unreigster platform dependent EEH operations
+ * @name: name of EEH platform operations
+ *
+ * Unregister the platform dependent EEH operation callback
+ * functions.
+ */
+int __exit eeh_ops_unregister(const char *name)
+{
+	if (!name || !strlen(name)) {
+		pr_warn("%s: Invalid EEH ops name\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && !strcmp(eeh_ops->name, name)) {
+		eeh_ops = NULL;
+		return 0;
+	}
+
+	return -EEXIST;
+}
+
+static int eeh_reboot_notifier(struct notifier_block *nb,
+			       unsigned long action, void *unused)
+{
+	eeh_clear_flag(EEH_ENABLED);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_reboot_nb = {
+	.notifier_call = eeh_reboot_notifier,
+};
+
+void eeh_probe_devices(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct pci_dn *pdn;
+
+	/* Enable EEH for all adapters */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		pdn = hose->pci_data;
+		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
+	}
+}
+
+/**
+ * eeh_init - EEH initialization
+ *
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check.  If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+static int eeh_init(void)
+{
+	struct pci_controller *hose, *tmp;
+	int ret = 0;
+
+	/* Register reboot notifier */
+	ret = register_reboot_notifier(&eeh_reboot_nb);
+	if (ret) {
+		pr_warn("%s: Failed to register notifier (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	/* call platform initialization function */
+	if (!eeh_ops) {
+		pr_warn("%s: Platform EEH operation not found\n",
+			__func__);
+		return -EEXIST;
+	} else if ((ret = eeh_ops->init()))
+		return ret;
+
+	/* Initialize PHB PEs */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		eeh_dev_phb_init_dynamic(hose);
+
+	/* Initialize EEH event */
+	ret = eeh_event_init();
+	if (ret)
+		return ret;
+
+	eeh_probe_devices();
+
+	if (eeh_enabled())
+		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+	else if (!eeh_has_flag(EEH_POSTPONED_PROBE))
+		pr_info("EEH: No capable adapters found\n");
+
+	return ret;
+}
+
+core_initcall_sync(eeh_init);
+
+/**
+ * eeh_add_device_early - Enable EEH for the indicated device node
+ * @pdn: PCI device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+void eeh_add_device_early(struct pci_dn *pdn)
+{
+	struct pci_controller *phb = pdn ? pdn->phb : NULL;
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+	if (!edev)
+		return;
+
+	if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
+		return;
+
+	/* USB Bus children of PCI devices will not have BUID's */
+	if (NULL == phb ||
+	    (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid))
+		return;
+
+	eeh_ops->probe(pdn, NULL);
+}
+
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @pdn: PCI device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
+void eeh_add_device_tree_early(struct pci_dn *pdn)
+{
+	struct pci_dn *n;
+
+	if (!pdn)
+		return;
+
+	list_for_each_entry(n, &pdn->child_list, list)
+		eeh_add_device_tree_early(n);
+	eeh_add_device_early(pdn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+void eeh_add_device_late(struct pci_dev *dev)
+{
+	struct pci_dn *pdn;
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_enabled())
+		return;
+
+	pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+	edev = pdn_to_eeh_dev(pdn);
+	if (edev->pdev == dev) {
+		pr_debug("EEH: Already referenced !\n");
+		return;
+	}
+
+	/*
+	 * The EEH cache might not be removed correctly because of
+	 * unbalanced kref to the device during unplug time, which
+	 * relies on pcibios_release_device(). So we have to remove
+	 * that here explicitly.
+	 */
+	if (edev->pdev) {
+		eeh_rmv_from_parent_pe(edev);
+		eeh_addr_cache_rmv_dev(edev->pdev);
+		eeh_sysfs_remove_device(edev->pdev);
+		edev->mode &= ~EEH_DEV_SYSFS;
+
+		/*
+		 * We definitely should have the PCI device removed
+		 * though it wasn't correctly. So we needn't call
+		 * into error handler afterwards.
+		 */
+		edev->mode |= EEH_DEV_NO_HANDLER;
+
+		edev->pdev = NULL;
+		dev->dev.archdata.edev = NULL;
+	}
+
+	if (eeh_has_flag(EEH_PROBE_MODE_DEV))
+		eeh_ops->probe(pdn, NULL);
+
+	edev->pdev = dev;
+	dev->dev.archdata.edev = edev;
+
+	eeh_addr_cache_insert_dev(dev);
+}
+
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_device_tree_late(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to add EEH sysfs files for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_sysfs_files(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_sysfs_add_device(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_sysfs_files(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
+
+/**
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar).  It unregisters
+ * the PCI device from the EEH subsystem.  I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+void eeh_remove_device(struct pci_dev *dev)
+{
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_enabled())
+		return;
+	edev = pci_dev_to_eeh_dev(dev);
+
+	/* Unregister the device with the EEH/PCI address search system */
+	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+	if (!edev || !edev->pdev || !edev->pe) {
+		pr_debug("EEH: Not referenced !\n");
+		return;
+	}
+
+	/*
+	 * During the hotplug for EEH error recovery, we need the EEH
+	 * device attached to the parent PE in order for BAR restore
+	 * a bit later. So we keep it for BAR restore and remove it
+	 * from the parent PE during the BAR resotre.
+	 */
+	edev->pdev = NULL;
+
+	/*
+	 * The flag "in_error" is used to trace EEH devices for VFs
+	 * in error state or not. It's set in eeh_report_error(). If
+	 * it's not set, eeh_report_{reset,resume}() won't be called
+	 * for the VF EEH device.
+	 */
+	edev->in_error = false;
+	dev->dev.archdata.edev = NULL;
+	if (!(edev->pe->state & EEH_PE_KEEP))
+		eeh_rmv_from_parent_pe(edev);
+	else
+		edev->mode |= EEH_DEV_DISCONNECTED;
+
+	/*
+	 * We're removing from the PCI subsystem, that means
+	 * the PCI device driver can't support EEH or not
+	 * well. So we rely on hotplug completely to do recovery
+	 * for the specific PCI device.
+	 */
+	edev->mode |= EEH_DEV_NO_HANDLER;
+
+	eeh_addr_cache_rmv_dev(dev);
+	eeh_sysfs_remove_device(dev);
+	edev->mode &= ~EEH_DEV_SYSFS;
+}
+
+int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state)
+{
+	int ret;
+
+	ret = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+	if (ret) {
+		pr_warn("%s: Failure %d enabling IO on PHB#%x-PE#%x\n",
+			__func__, ret, pe->phb->global_number, pe->addr);
+		return ret;
+	}
+
+	ret = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+	if (ret) {
+		pr_warn("%s: Failure %d enabling DMA on PHB#%x-PE#%x\n",
+			__func__, ret, pe->phb->global_number, pe->addr);
+		return ret;
+	}
+
+	/* Clear software isolated state */
+	if (sw_state && (pe->state & EEH_PE_ISOLATED))
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+	return ret;
+}
+
+
+static struct pci_device_id eeh_reset_ids[] = {
+	{ PCI_DEVICE(0x19a2, 0x0710) },	/* Emulex, BE     */
+	{ PCI_DEVICE(0x10df, 0xe220) },	/* Emulex, Lancer */
+	{ PCI_DEVICE(0x14e4, 0x1657) }, /* Broadcom BCM5719 */
+	{ 0 }
+};
+
+static int eeh_pe_change_owner(struct eeh_pe *pe)
+{
+	struct eeh_dev *edev, *tmp;
+	struct pci_dev *pdev;
+	struct pci_device_id *id;
+	int ret;
+
+	/* Check PE state */
+	ret = eeh_ops->get_state(pe, NULL);
+	if (ret < 0 || ret == EEH_STATE_NOT_SUPPORT)
+		return 0;
+
+	/* Unfrozen PE, nothing to do */
+	if (eeh_state_active(ret))
+		return 0;
+
+	/* Frozen PE, check if it needs PE level reset */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (!pdev)
+			continue;
+
+		for (id = &eeh_reset_ids[0]; id->vendor != 0; id++) {
+			if (id->vendor != PCI_ANY_ID &&
+			    id->vendor != pdev->vendor)
+				continue;
+			if (id->device != PCI_ANY_ID &&
+			    id->device != pdev->device)
+				continue;
+			if (id->subvendor != PCI_ANY_ID &&
+			    id->subvendor != pdev->subsystem_vendor)
+				continue;
+			if (id->subdevice != PCI_ANY_ID &&
+			    id->subdevice != pdev->subsystem_device)
+				continue;
+
+			return eeh_pe_reset_and_recover(pe);
+		}
+	}
+
+	return eeh_unfreeze_pe(pe, true);
+}
+
+/**
+ * eeh_dev_open - Increase count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Increase count of passed through devices for the indicated
+ * PE. In the result, the EEH errors detected on the PE won't be
+ * reported. The PE owner will be responsible for detection
+ * and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+	int ret = -ENODEV;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev)
+		goto out;
+
+	/* No EEH device or PE ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe)
+		goto out;
+
+	/*
+	 * The PE might have been put into frozen state, but we
+	 * didn't detect that yet. The passed through PCI devices
+	 * in frozen PE won't work properly. Clear the frozen state
+	 * in advance.
+	 */
+	ret = eeh_pe_change_owner(edev->pe);
+	if (ret)
+		goto out;
+
+	/* Increase PE's pass through count */
+	atomic_inc(&edev->pe->pass_dev_cnt);
+	mutex_unlock(&eeh_dev_mutex);
+
+	return 0;
+out:
+	mutex_unlock(&eeh_dev_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Decrease count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Decrease count of pass through devices for the indicated PE. If
+ * there is no passed through device in PE, the EEH errors detected
+ * on the PE will be reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev)
+		goto out;
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe || !eeh_pe_passed(edev->pe))
+		goto out;
+
+	/* Decrease PE's pass through count */
+	WARN_ON(atomic_dec_if_positive(&edev->pe->pass_dev_cnt) < 0);
+	eeh_pe_change_owner(edev->pe);
+out:
+	mutex_unlock(&eeh_dev_mutex);
+}
+EXPORT_SYMBOL(eeh_dev_release);
+
+#ifdef CONFIG_IOMMU_API
+
+static int dev_has_iommu_table(struct device *dev, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_dev **ppdev = data;
+
+	if (!dev)
+		return 0;
+
+	if (dev->iommu_group) {
+		*ppdev = pdev;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE
+ * @group: IOMMU group
+ *
+ * The routine is called to convert IOMMU group to EEH PE.
+ */
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group)
+{
+	struct pci_dev *pdev = NULL;
+	struct eeh_dev *edev;
+	int ret;
+
+	/* No IOMMU group ? */
+	if (!group)
+		return NULL;
+
+	ret = iommu_group_for_each_dev(group, &pdev, dev_has_iommu_table);
+	if (!ret || !pdev)
+		return NULL;
+
+	/* No EEH device or PE ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe)
+		return NULL;
+
+	return edev->pe;
+}
+EXPORT_SYMBOL_GPL(eeh_iommu_group_to_pe);
+
+#endif /* CONFIG_IOMMU_API */
+
+/**
+ * eeh_pe_set_option - Set options for the indicated PE
+ * @pe: EEH PE
+ * @option: requested option
+ *
+ * The routine is called to enable or disable EEH functionality
+ * on the indicated PE, to enable IO or DMA for the frozen PE.
+ */
+int eeh_pe_set_option(struct eeh_pe *pe, int option)
+{
+	int ret = 0;
+
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	/*
+	 * EEH functionality could possibly be disabled, just
+	 * return error for the case. And the EEH functinality
+	 * isn't expected to be disabled on one specific PE.
+	 */
+	switch (option) {
+	case EEH_OPT_ENABLE:
+		if (eeh_enabled()) {
+			ret = eeh_pe_change_owner(pe);
+			break;
+		}
+		ret = -EIO;
+		break;
+	case EEH_OPT_DISABLE:
+		break;
+	case EEH_OPT_THAW_MMIO:
+	case EEH_OPT_THAW_DMA:
+	case EEH_OPT_FREEZE_PE:
+		if (!eeh_ops || !eeh_ops->set_option) {
+			ret = -ENOENT;
+			break;
+		}
+
+		ret = eeh_pci_enable(pe, option);
+		break;
+	default:
+		pr_debug("%s: Option %d out of range (%d, %d)\n",
+			__func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_set_option);
+
+/**
+ * eeh_pe_get_state - Retrieve PE's state
+ * @pe: EEH PE
+ *
+ * Retrieve the PE's state, which includes 3 aspects: enabled
+ * DMA, enabled IO and asserted reset.
+ */
+int eeh_pe_get_state(struct eeh_pe *pe)
+{
+	int result, ret = 0;
+	bool rst_active, dma_en, mmio_en;
+
+	/* Existing PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	if (!eeh_ops || !eeh_ops->get_state)
+		return -ENOENT;
+
+	/*
+	 * If the parent PE is owned by the host kernel and is undergoing
+	 * error recovery, we should return the PE state as temporarily
+	 * unavailable so that the error recovery on the guest is suspended
+	 * until the recovery completes on the host.
+	 */
+	if (pe->parent &&
+	    !(pe->state & EEH_PE_REMOVED) &&
+	    (pe->parent->state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
+		return EEH_PE_STATE_UNAVAIL;
+
+	result = eeh_ops->get_state(pe, NULL);
+	rst_active = !!(result & EEH_STATE_RESET_ACTIVE);
+	dma_en = !!(result & EEH_STATE_DMA_ENABLED);
+	mmio_en = !!(result & EEH_STATE_MMIO_ENABLED);
+
+	if (rst_active)
+		ret = EEH_PE_STATE_RESET;
+	else if (dma_en && mmio_en)
+		ret = EEH_PE_STATE_NORMAL;
+	else if (!dma_en && !mmio_en)
+		ret = EEH_PE_STATE_STOPPED_IO_DMA;
+	else if (!dma_en && mmio_en)
+		ret = EEH_PE_STATE_STOPPED_DMA;
+	else
+		ret = EEH_PE_STATE_UNAVAIL;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_get_state);
+
+static int eeh_pe_reenable_devices(struct eeh_pe *pe)
+{
+	struct eeh_dev *edev, *tmp;
+	struct pci_dev *pdev;
+	int ret = 0;
+
+	/* Restore config space */
+	eeh_pe_restore_bars(pe);
+
+	/*
+	 * Reenable PCI devices as the devices passed
+	 * through are always enabled before the reset.
+	 */
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (!pdev)
+			continue;
+
+		ret = pci_reenable_device(pdev);
+		if (ret) {
+			pr_warn("%s: Failure %d reenabling %s\n",
+				__func__, ret, pci_name(pdev));
+			return ret;
+		}
+	}
+
+	/* The PE is still in frozen state */
+	return eeh_unfreeze_pe(pe, true);
+}
+
+
+/**
+ * eeh_pe_reset - Issue PE reset according to specified type
+ * @pe: EEH PE
+ * @option: reset type
+ *
+ * The routine is called to reset the specified PE with the
+ * indicated type, either fundamental reset or hot reset.
+ * PE reset is the most important part for error recovery.
+ */
+int eeh_pe_reset(struct eeh_pe *pe, int option)
+{
+	int ret = 0;
+
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
+		return -ENOENT;
+
+	switch (option) {
+	case EEH_RESET_DEACTIVATE:
+		ret = eeh_ops->reset(pe, option);
+		eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+		if (ret)
+			break;
+
+		ret = eeh_pe_reenable_devices(pe);
+		break;
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		/*
+		 * Proactively freeze the PE to drop all MMIO access
+		 * during reset, which should be banned as it's always
+		 * cause recursive EEH error.
+		 */
+		eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
+
+		eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+		ret = eeh_ops->reset(pe, option);
+		break;
+	default:
+		pr_debug("%s: Unsupported option %d\n",
+			__func__, option);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_reset);
+
+/**
+ * eeh_pe_configure - Configure PCI bridges after PE reset
+ * @pe: EEH PE
+ *
+ * The routine is called to restore the PCI config space for
+ * those PCI devices, especially PCI bridges affected by PE
+ * reset issued previously.
+ */
+int eeh_pe_configure(struct eeh_pe *pe)
+{
+	int ret = 0;
+
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_configure);
+
+/**
+ * eeh_pe_inject_err - Injecting the specified PCI error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @function: error function
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject the specified PCI error, which
+ * is determined by @type and @function, to the indicated PE for
+ * testing purpose.
+ */
+int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
+		      unsigned long addr, unsigned long mask)
+{
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	/* Unsupported operation ? */
+	if (!eeh_ops || !eeh_ops->err_inject)
+		return -ENOENT;
+
+	/* Check on PCI error type */
+	if (type != EEH_ERR_TYPE_32 && type != EEH_ERR_TYPE_64)
+		return -EINVAL;
+
+	/* Check on PCI error function */
+	if (func < EEH_ERR_FUNC_MIN || func > EEH_ERR_FUNC_MAX)
+		return -EINVAL;
+
+	return eeh_ops->err_inject(pe, type, func, addr, mask);
+}
+EXPORT_SYMBOL_GPL(eeh_pe_inject_err);
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+	if (!eeh_enabled()) {
+		seq_printf(m, "EEH Subsystem is globally disabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
+	} else {
+		seq_printf(m, "EEH Subsystem is enabled\n");
+		seq_printf(m,
+				"no device=%llu\n"
+				"no device node=%llu\n"
+				"no config address=%llu\n"
+				"check not wanted=%llu\n"
+				"eeh_total_mmio_ffs=%llu\n"
+				"eeh_false_positives=%llu\n"
+				"eeh_slot_resets=%llu\n",
+				eeh_stats.no_device,
+				eeh_stats.no_dn,
+				eeh_stats.no_cfg_addr,
+				eeh_stats.ignored_check,
+				eeh_stats.total_mmio_ffs,
+				eeh_stats.false_positives,
+				eeh_stats.slot_resets);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int eeh_enable_dbgfs_set(void *data, u64 val)
+{
+	if (val)
+		eeh_clear_flag(EEH_FORCE_DISABLED);
+	else
+		eeh_add_flag(EEH_FORCE_DISABLED);
+
+	return 0;
+}
+
+static int eeh_enable_dbgfs_get(void *data, u64 *val)
+{
+	if (eeh_enabled())
+		*val = 0x1ul;
+	else
+		*val = 0x0ul;
+	return 0;
+}
+
+static int eeh_freeze_dbgfs_set(void *data, u64 val)
+{
+	eeh_max_freezes = val;
+	return 0;
+}
+
+static int eeh_freeze_dbgfs_get(void *data, u64 *val)
+{
+	*val = eeh_max_freezes;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
+			eeh_enable_dbgfs_set, "0x%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get,
+			eeh_freeze_dbgfs_set, "0x%llx\n");
+#endif
+
+static int __init eeh_init_proc(void)
+{
+	if (machine_is(pseries) || machine_is(powernv)) {
+		proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show);
+#ifdef CONFIG_DEBUG_FS
+		debugfs_create_file("eeh_enable", 0600,
+                                    powerpc_debugfs_root, NULL,
+                                    &eeh_enable_dbgfs_ops);
+		debugfs_create_file("eeh_max_freezes", 0600,
+				    powerpc_debugfs_root, NULL,
+				    &eeh_freeze_dbgfs_ops);
+#endif
+	}
+
+	return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
new file mode 100644
index 000000000..201943d54
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -0,0 +1,306 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem.  This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+	struct rb_node rb_node;
+	resource_size_t addr_lo;
+	resource_size_t addr_hi;
+	struct eeh_dev *edev;
+	struct pci_dev *pcidev;
+	unsigned long flags;
+};
+
+static struct pci_io_addr_cache {
+	struct rb_root rb_root;
+	spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
+{
+	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (addr < piar->addr_lo)
+			n = n->rb_left;
+		else if (addr > piar->addr_hi)
+			n = n->rb_right;
+		else
+			return piar->edev;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_addr_cache_get_dev - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address.  I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
+{
+	struct eeh_dev *edev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	edev = __eeh_addr_cache_get_device(addr);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+	return edev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+	struct rb_node *n;
+	int cnt = 0;
+
+	n = rb_first(&cache->rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+		pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n",
+		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+		       &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
+		cnt++;
+		n = rb_next(n);
+	}
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
+		      resource_size_t ahi, unsigned long flags)
+{
+	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Walk tree, find a place to insert into tree */
+	while (*p) {
+		parent = *p;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+		if (ahi < piar->addr_lo) {
+			p = &parent->rb_left;
+		} else if (alo > piar->addr_hi) {
+			p = &parent->rb_right;
+		} else {
+			if (dev != piar->pcidev ||
+			    alo != piar->addr_lo || ahi != piar->addr_hi) {
+				pr_warn("PIAR: overlapping address range\n");
+			}
+			return piar;
+		}
+	}
+	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->edev = pci_dev_to_eeh_dev(dev);
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+#ifdef DEBUG
+	pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
+		 &alo, &ahi, pci_name(dev));
+#endif
+
+	rb_link_node(&piar->rb_node, parent, p);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	struct pci_dn *pdn;
+	struct eeh_dev *edev;
+	int i;
+
+	pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+	if (!pdn) {
+		pr_warn("PCI: no pci dn found for dev=%s\n",
+			pci_name(dev));
+		return;
+	}
+
+	edev = pdn_to_eeh_dev(pdn);
+	if (!edev) {
+		pr_warn("PCI: no EEH dev found for %s\n",
+			pci_name(dev));
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	if (!edev->pe) {
+		dev_dbg(&dev->dev, "EEH: Skip building address cache\n");
+		return;
+	}
+
+	/*
+	 * Walk resources on this device, poke the first 7 (6 normal BAR and 1
+	 * ROM BAR) into the tree.
+	 */
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+		resource_size_t start = pci_resource_start(dev,i);
+		resource_size_t end = pci_resource_end(dev,i);
+		unsigned long flags = pci_resource_flags(dev,i);
+
+		/* We are interested only bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			 continue;
+		eeh_addr_cache_insert(dev, start, end, flags);
+	}
+}
+
+/**
+ * eeh_addr_cache_insert_dev - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_insert_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	struct rb_node *n;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_rmv_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void eeh_addr_cache_build(void)
+{
+	struct pci_dn *pdn;
+	struct eeh_dev *edev;
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	for_each_pci_dev(dev) {
+		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+		if (!pdn)
+			continue;
+
+		edev = pdn_to_eeh_dev(pdn);
+		if (!edev)
+			continue;
+
+		dev->dev.archdata.edev = edev;
+		edev->pdev = dev;
+
+		eeh_addr_cache_insert_dev(dev);
+		eeh_sysfs_add_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 000000000..a34e6912c
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,80 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ *    PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ *    the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ *    will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ *    PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @pdn: PCI device node
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev;
+
+	/* Allocate EEH device */
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+	if (!edev)
+		return NULL;
+
+	/* Associate EEH device with OF node */
+	pdn->edev = edev;
+	edev->pdn = pdn;
+	INIT_LIST_HEAD(&edev->list);
+	INIT_LIST_HEAD(&edev->rmv_list);
+
+	return edev;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+	/* EEH PE for PHB */
+	eeh_phb_pe_create(phb);
+}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 000000000..377d23f58
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,1112 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+struct eeh_rmv_data {
+	struct list_head edev_list;
+	int removed;
+};
+
+static int eeh_result_priority(enum pci_ers_result result)
+{
+	switch (result) {
+	case PCI_ERS_RESULT_NONE:
+		return 1;
+	case PCI_ERS_RESULT_NO_AER_DRIVER:
+		return 2;
+	case PCI_ERS_RESULT_RECOVERED:
+		return 3;
+	case PCI_ERS_RESULT_CAN_RECOVER:
+		return 4;
+	case PCI_ERS_RESULT_DISCONNECT:
+		return 5;
+	case PCI_ERS_RESULT_NEED_RESET:
+		return 6;
+	default:
+		WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result);
+		return 0;
+	}
+};
+
+const char *pci_ers_result_name(enum pci_ers_result result)
+{
+	switch (result) {
+	case PCI_ERS_RESULT_NONE:
+		return "none";
+	case PCI_ERS_RESULT_CAN_RECOVER:
+		return "can recover";
+	case PCI_ERS_RESULT_NEED_RESET:
+		return "need reset";
+	case PCI_ERS_RESULT_DISCONNECT:
+		return "disconnect";
+	case PCI_ERS_RESULT_RECOVERED:
+		return "recovered";
+	case PCI_ERS_RESULT_NO_AER_DRIVER:
+		return "no AER driver";
+	default:
+		WARN_ONCE(1, "Unknown result type: %d\n", (int)result);
+		return "unknown";
+	}
+};
+
+static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
+					 const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
+	       edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
+
+	va_end(args);
+}
+
+static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
+						enum pci_ers_result new)
+{
+	if (eeh_result_priority(new) > eeh_result_priority(old))
+		return new;
+	return old;
+}
+
+static bool eeh_dev_removed(struct eeh_dev *edev)
+{
+	return !edev || (edev->mode & EEH_DEV_REMOVED);
+}
+
+static bool eeh_edev_actionable(struct eeh_dev *edev)
+{
+	return (edev->pdev && !eeh_dev_removed(edev) &&
+		!eeh_pe_passed(edev->pe));
+}
+
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return NULL;
+
+	if (!try_module_get(pdev->driver->driver.owner))
+		return NULL;
+
+	return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return;
+
+	module_put(pdev->driver->driver.owner);
+}
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct eeh_dev *edev)
+{
+	/* Don't disable MSI and MSI-X interrupts. They are
+	 * effectively disabled by the DMA Stopped state
+	 * when an EEH error occurs.
+	 */
+	if (edev->pdev->msi_enabled || edev->pdev->msix_enabled)
+		return;
+
+	if (!irq_has_action(edev->pdev->irq))
+		return;
+
+	edev->mode |= EEH_DEV_IRQ_DISABLED;
+	disable_irq_nosync(edev->pdev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct eeh_dev *edev)
+{
+	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
+		/*
+		 * FIXME !!!!!
+		 *
+		 * This is just ass backwards. This maze has
+		 * unbalanced irq_enable/disable calls. So instead of
+		 * finding the root cause it works around the warning
+		 * in the irq_enable code by conditionally calling
+		 * into it.
+		 *
+		 * That's just wrong.The warning in the core code is
+		 * there to tell people to fix their asymmetries in
+		 * their own code, not by abusing the core information
+		 * to avoid it.
+		 *
+		 * I so wish that the assymetry would be the other way
+		 * round and a few more irq_disable calls render that
+		 * shit unusable forever.
+		 *
+		 *	tglx
+		 */
+		if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq)))
+			enable_irq(edev->pdev->irq);
+	}
+}
+
+static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
+{
+	struct pci_dev *pdev;
+
+	if (!edev)
+		return NULL;
+
+	/*
+	 * We cannot access the config space on some adapters.
+	 * Otherwise, it will cause fenced PHB. We don't save
+	 * the content in their config space and will restore
+	 * from the initial config space saved when the EEH
+	 * device is created.
+	 */
+	if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
+		return NULL;
+
+	pdev = eeh_dev_to_pci_dev(edev);
+	if (!pdev)
+		return NULL;
+
+	pci_save_state(pdev);
+	return NULL;
+}
+
+static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+
+	eeh_for_each_pe(root, pe)
+		eeh_pe_for_each_dev(pe, edev, tmp)
+			if (eeh_edev_actionable(edev))
+				edev->pdev->error_state = s;
+}
+
+static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+
+	eeh_for_each_pe(root, pe) {
+		eeh_pe_for_each_dev(pe, edev, tmp) {
+			if (!eeh_edev_actionable(edev))
+				continue;
+
+			if (!eeh_pcid_get(edev->pdev))
+				continue;
+
+			if (enable)
+				eeh_enable_irq(edev);
+			else
+				eeh_disable_irq(edev);
+
+			eeh_pcid_put(edev->pdev);
+		}
+	}
+}
+
+typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
+					     struct pci_driver *);
+static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
+			       enum pci_ers_result *result)
+{
+	struct pci_driver *driver;
+	enum pci_ers_result new_result;
+
+	if (!edev->pdev) {
+		eeh_edev_info(edev, "no device");
+		return;
+	}
+	device_lock(&edev->pdev->dev);
+	if (eeh_edev_actionable(edev)) {
+		driver = eeh_pcid_get(edev->pdev);
+
+		if (!driver)
+			eeh_edev_info(edev, "no driver");
+		else if (!driver->err_handler)
+			eeh_edev_info(edev, "driver not EEH aware");
+		else if (edev->mode & EEH_DEV_NO_HANDLER)
+			eeh_edev_info(edev, "driver bound too late");
+		else {
+			new_result = fn(edev, driver);
+			eeh_edev_info(edev, "%s driver reports: '%s'",
+				      driver->name,
+				      pci_ers_result_name(new_result));
+			if (result)
+				*result = pci_ers_merge_result(*result,
+							       new_result);
+		}
+		if (driver)
+			eeh_pcid_put(edev->pdev);
+	} else {
+		eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
+			      !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
+	}
+	device_unlock(&edev->pdev->dev);
+}
+
+static void eeh_pe_report(const char *name, struct eeh_pe *root,
+			  eeh_report_fn fn, enum pci_ers_result *result)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+
+	pr_info("EEH: Beginning: '%s'\n", name);
+	eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
+		eeh_pe_report_edev(edev, fn, result);
+	if (result)
+		pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
+			name, pci_ers_result_name(*result));
+	else
+		pr_info("EEH: Finished:'%s'", name);
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @edev: eeh device
+ * @driver: device's PCI driver
+ *
+ * Report an EEH error to each device driver.
+ */
+static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
+					    struct pci_driver *driver)
+{
+	enum pci_ers_result rc;
+	struct pci_dev *dev = edev->pdev;
+
+	if (!driver->err_handler->error_detected)
+		return PCI_ERS_RESULT_NONE;
+
+	eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
+		      driver->name);
+	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+	edev->in_error = true;
+	pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+	return rc;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @edev: eeh device
+ * @driver: device's PCI driver
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled.
+ */
+static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
+						   struct pci_driver *driver)
+{
+	if (!driver->err_handler->mmio_enabled)
+		return PCI_ERS_RESULT_NONE;
+	eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
+	return driver->err_handler->mmio_enabled(edev->pdev);
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @edev: eeh device
+ * @driver: device's PCI driver
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
+					    struct pci_driver *driver)
+{
+	if (!driver->err_handler->slot_reset || !edev->in_error)
+		return PCI_ERS_RESULT_NONE;
+	eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
+	return driver->err_handler->slot_reset(edev->pdev);
+}
+
+static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
+{
+	struct pci_dev *pdev;
+
+	if (!edev)
+		return NULL;
+
+	/*
+	 * The content in the config space isn't saved because
+	 * the blocked config space on some adapters. We have
+	 * to restore the initial saved config space when the
+	 * EEH device is created.
+	 */
+	if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
+		if (list_is_last(&edev->list, &edev->pe->edevs))
+			eeh_pe_restore_bars(edev->pe);
+
+		return NULL;
+	}
+
+	pdev = eeh_dev_to_pci_dev(edev);
+	if (!pdev)
+		return NULL;
+
+	pci_restore_state(pdev);
+	return NULL;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @edev: eeh device
+ * @driver: device's PCI driver
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
+					     struct pci_driver *driver)
+{
+	if (!driver->err_handler->resume || !edev->in_error)
+		return PCI_ERS_RESULT_NONE;
+
+	eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
+	driver->err_handler->resume(edev->pdev);
+
+	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
+#ifdef CONFIG_PCI_IOV
+	if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
+		eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
+#endif
+	return PCI_ERS_RESULT_NONE;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @edev: eeh device
+ * @driver: device's PCI driver
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
+					      struct pci_driver *driver)
+{
+	enum pci_ers_result rc;
+
+	if (!driver->err_handler->error_detected)
+		return PCI_ERS_RESULT_NONE;
+
+	eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
+		      driver->name);
+	rc = driver->err_handler->error_detected(edev->pdev,
+						 pci_channel_io_perm_failure);
+
+	pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
+	return rc;
+}
+
+static void *eeh_add_virt_device(void *data, void *userdata)
+{
+	struct pci_driver *driver;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	if (!(edev->physfn)) {
+		pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
+			__func__, pdn->phb->global_number, pdn->busno,
+			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+		return NULL;
+	}
+
+	driver = eeh_pcid_get(dev);
+	if (driver) {
+		if (driver->err_handler) {
+			eeh_pcid_put(dev);
+			return NULL;
+		}
+		eeh_pcid_put(dev);
+	}
+
+#ifdef CONFIG_PCI_IOV
+	pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
+#endif
+	return NULL;
+}
+
+static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
+{
+	struct pci_driver *driver;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
+	int *removed = rmv_data ? &rmv_data->removed : NULL;
+
+	/*
+	 * Actually, we should remove the PCI bridges as well.
+	 * However, that's lots of complexity to do that,
+	 * particularly some of devices under the bridge might
+	 * support EEH. So we just care about PCI devices for
+	 * simplicity here.
+	 */
+	if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
+		return NULL;
+
+	/*
+	 * We rely on count-based pcibios_release_device() to
+	 * detach permanently offlined PEs. Unfortunately, that's
+	 * not reliable enough. We might have the permanently
+	 * offlined PEs attached, but we needn't take care of
+	 * them and their child devices.
+	 */
+	if (eeh_dev_removed(edev))
+		return NULL;
+
+	if (removed) {
+		if (eeh_pe_passed(edev->pe))
+			return NULL;
+		driver = eeh_pcid_get(dev);
+		if (driver) {
+			if (driver->err_handler &&
+			    driver->err_handler->error_detected &&
+			    driver->err_handler->slot_reset) {
+				eeh_pcid_put(dev);
+				return NULL;
+			}
+			eeh_pcid_put(dev);
+		}
+	}
+
+	/* Remove it from PCI subsystem */
+	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
+		 pci_name(dev));
+	edev->bus = dev->bus;
+	edev->mode |= EEH_DEV_DISCONNECTED;
+	if (removed)
+		(*removed)++;
+
+	if (edev->physfn) {
+#ifdef CONFIG_PCI_IOV
+		struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
+		edev->pdev = NULL;
+#endif
+		if (rmv_data)
+			list_add(&edev->rmv_list, &rmv_data->edev_list);
+	} else {
+		pci_lock_rescan_remove();
+		pci_stop_and_remove_bus_device(dev);
+		pci_unlock_rescan_remove();
+	}
+
+	return NULL;
+}
+
+static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
+{
+	struct eeh_dev *edev, *tmp;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (!(edev->mode & EEH_DEV_DISCONNECTED))
+			continue;
+
+		edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
+		eeh_rmv_from_parent_pe(edev);
+	}
+
+	return NULL;
+}
+
+/*
+ * Explicitly clear PE's frozen state for PowerNV where
+ * we have frozen PE until BAR restore is completed. It's
+ * harmless to clear it for pSeries. To be consistent with
+ * PE reset (for 3 times), we try to clear the frozen state
+ * for 3 times as well.
+ */
+static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag)
+{
+	bool clear_sw_state = *(bool *)flag;
+	int i, rc = 1;
+
+	for (i = 0; rc && i < 3; i++)
+		rc = eeh_unfreeze_pe(pe, clear_sw_state);
+
+	/* Stop immediately on any errors */
+	if (rc) {
+		pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n",
+			__func__, rc, pe->phb->global_number, pe->addr);
+		return (void *)pe;
+	}
+
+	return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
+				     bool clear_sw_state)
+{
+	void *rc;
+
+	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state);
+	if (!rc)
+		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+	return rc ? -EIO : 0;
+}
+
+int eeh_pe_reset_and_recover(struct eeh_pe *pe)
+{
+	int ret;
+
+	/* Bail if the PE is being recovered */
+	if (pe->state & EEH_PE_RECOVERING)
+		return 0;
+
+	/* Put the PE into recovery mode */
+	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+
+	/* Save states */
+	eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
+
+	/* Issue reset */
+	ret = eeh_pe_reset_full(pe);
+	if (ret) {
+		eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+		return ret;
+	}
+
+	/* Unfreeze the PE */
+	ret = eeh_clear_pe_frozen_state(pe, true);
+	if (ret) {
+		eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+		return ret;
+	}
+
+	/* Restore device state */
+	eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
+
+	/* Clear recovery mode */
+	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+
+	return 0;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @driver_eeh_aware: Does the device's driver provide EEH support?
+ * @pe: EEH PE
+ * @bus: PCI bus corresponding to the isolcated slot
+ * @rmv_data: Optional, list to record removed devices
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
+			    struct eeh_rmv_data *rmv_data,
+			    bool driver_eeh_aware)
+{
+	time64_t tstamp;
+	int cnt, rc;
+	struct eeh_dev *edev;
+
+	/* pcibios will clear the counter; save the value */
+	cnt = pe->freeze_count;
+	tstamp = pe->tstamp;
+
+	/*
+	 * We don't remove the corresponding PE instances because
+	 * we need the information afterwords. The attached EEH
+	 * devices are expected to be attached soon when calling
+	 * into pci_hp_add_devices().
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_KEEP);
+	if (driver_eeh_aware || (pe->type & EEH_PE_VF)) {
+		eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
+	} else {
+		pci_lock_rescan_remove();
+		pci_hp_remove_devices(bus);
+		pci_unlock_rescan_remove();
+	}
+
+	/*
+	 * Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices. Don't try to bring the system
+	 * up if the reset failed for some reason.
+	 *
+	 * During the reset, it's very dangerous to have uncontrolled PCI
+	 * config accesses. So we prefer to block them. However, controlled
+	 * PCI config accesses initiated from EEH itself are allowed.
+	 */
+	rc = eeh_pe_reset_full(pe);
+	if (rc)
+		return rc;
+
+	pci_lock_rescan_remove();
+
+	/* Restore PE */
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
+
+	/* Clear frozen state */
+	rc = eeh_clear_pe_frozen_state(pe, false);
+	if (rc) {
+		pci_unlock_rescan_remove();
+		return rc;
+	}
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
+	 * this is a hack, but if we don't do this, and try to bring
+	 * the device up before the scripts have taken it down,
+	 * potentially weird things happen.
+	 */
+	if (!driver_eeh_aware || rmv_data->removed) {
+		pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
+			(driver_eeh_aware ? "partial" : "complete"));
+		ssleep(5);
+
+		/*
+		 * The EEH device is still connected with its parent
+		 * PE. We should disconnect it so the binding can be
+		 * rebuilt when adding PCI devices.
+		 */
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
+		if (pe->type & EEH_PE_VF) {
+			eeh_add_virt_device(edev, NULL);
+		} else {
+			if (!driver_eeh_aware)
+				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+			pci_hp_add_devices(bus);
+		}
+	}
+	eeh_pe_state_clear(pe, EEH_PE_KEEP);
+
+	pe->tstamp = tstamp;
+	pe->freeze_count = cnt;
+
+	pci_unlock_rescan_remove();
+	return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 300
+
+/**
+ * eeh_handle_normal_event - Handle EEH events on a specific PE
+ * @pe: EEH PE - which should not be used after we return, as it may
+ * have been invalidated.
+ *
+ * Attempts to recover the given PE.  If recovery fails or the PE has failed
+ * too many times, remove the PE.
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_normal_event(struct eeh_pe *pe)
+{
+	struct pci_bus *bus;
+	struct eeh_dev *edev, *tmp;
+	struct eeh_pe *tmp_pe;
+	int rc = 0;
+	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+	struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
+
+	bus = eeh_pe_bus_get(pe);
+	if (!bus) {
+		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
+			__func__, pe->phb->global_number, pe->addr);
+		return;
+	}
+
+	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+
+	eeh_pe_update_time_stamp(pe);
+	pe->freeze_count++;
+	if (pe->freeze_count > eeh_max_freezes) {
+		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
+		       pe->phb->global_number, pe->addr,
+		       pe->freeze_count);
+		goto hard_fail;
+	}
+	pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
+		pe->freeze_count, eeh_max_freezes);
+
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+
+	/* Walk the various device drivers attached to this slot through
+	 * a reset sequence, giving each an opportunity to do what it needs
+	 * to accomplish the reset.  Each child gets a report of the
+	 * status ... if any child can't handle the reset, then the entire
+	 * slot is dlpar removed and added.
+	 *
+	 * When the PHB is fenced, we have to issue a reset to recover from
+	 * the error. Override the result if necessary to have partially
+	 * hotplug for this case.
+	 */
+	pr_info("EEH: Notify device drivers to shutdown\n");
+	eeh_set_channel_state(pe, pci_channel_io_frozen);
+	eeh_set_irq_state(pe, false);
+	eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error,
+		      &result);
+	if ((pe->type & EEH_PE_PHB) &&
+	    result != PCI_ERS_RESULT_NONE &&
+	    result != PCI_ERS_RESULT_NEED_RESET)
+		result = PCI_ERS_RESULT_NEED_RESET;
+
+	/* Get the current PCI slot state. This can take a long time,
+	 * sometimes over 300 seconds for certain systems.
+	 */
+	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
+	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+		pr_warn("EEH: Permanent failure\n");
+		goto hard_fail;
+	}
+
+	/* Since rtas may enable MMIO when posting the error log,
+	 * don't post the error log until after all dev drivers
+	 * have been informed.
+	 */
+	pr_info("EEH: Collect temporary log\n");
+	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
+
+	/* If all device drivers were EEH-unaware, then shut
+	 * down all of the device drivers, and hope they
+	 * go down willingly, without panicing the system.
+	 */
+	if (result == PCI_ERS_RESULT_NONE) {
+		pr_info("EEH: Reset with hotplug activity\n");
+		rc = eeh_reset_device(pe, bus, NULL, false);
+		if (rc) {
+			pr_warn("%s: Unable to reset, err=%d\n",
+				__func__, rc);
+			goto hard_fail;
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable MMIO */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enable I/O for affected devices\n");
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			pr_info("EEH: Notify device drivers to resume I/O\n");
+			eeh_pe_report("mmio_enabled", pe,
+				      eeh_report_mmio_enabled, &result);
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable DMA */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enabled DMA for affected devices\n");
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			/*
+			 * We didn't do PE reset for the case. The PE
+			 * is still in frozen state. Clear it before
+			 * resuming the PE.
+			 */
+			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+			result = PCI_ERS_RESULT_RECOVERED;
+		}
+	}
+
+	/* If any device has a hard failure, then shut off everything. */
+	if (result == PCI_ERS_RESULT_DISCONNECT) {
+		pr_warn("EEH: Device driver gave up\n");
+		goto hard_fail;
+	}
+
+	/* If any device called out for a reset, then reset the slot */
+	if (result == PCI_ERS_RESULT_NEED_RESET) {
+		pr_info("EEH: Reset without hotplug activity\n");
+		rc = eeh_reset_device(pe, bus, &rmv_data, true);
+		if (rc) {
+			pr_warn("%s: Cannot reset, err=%d\n",
+				__func__, rc);
+			goto hard_fail;
+		}
+
+		pr_info("EEH: Notify device drivers "
+			"the completion of reset\n");
+		result = PCI_ERS_RESULT_NONE;
+		eeh_set_channel_state(pe, pci_channel_io_normal);
+		eeh_set_irq_state(pe, true);
+		eeh_pe_report("slot_reset", pe, eeh_report_reset, &result);
+	}
+
+	/* All devices should claim they have recovered by now. */
+	if ((result != PCI_ERS_RESULT_RECOVERED) &&
+	    (result != PCI_ERS_RESULT_NONE)) {
+		pr_warn("EEH: Not recovered\n");
+		goto hard_fail;
+	}
+
+	/*
+	 * For those hot removed VFs, we should add back them after PF get
+	 * recovered properly.
+	 */
+	list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
+		eeh_add_virt_device(edev, NULL);
+		list_del(&edev->rmv_list);
+	}
+
+	/* Tell all device drivers that they can resume operations */
+	pr_info("EEH: Notify device driver to resume\n");
+	eeh_set_channel_state(pe, pci_channel_io_normal);
+	eeh_set_irq_state(pe, true);
+	eeh_pe_report("resume", pe, eeh_report_resume, NULL);
+	eeh_for_each_pe(pe, tmp_pe) {
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+			edev->in_error = false;
+		}
+	}
+
+	pr_info("EEH: Recovery successful.\n");
+	goto final;
+
+hard_fail:
+	/*
+	 * About 90% of all real-life EEH failures in the field
+	 * are due to poorly seated PCI cards. Only 10% or so are
+	 * due to actual, failed cards.
+	 */
+	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
+	       "Please try reseating or replacing it\n",
+		pe->phb->global_number, pe->addr);
+
+	eeh_slot_error_detail(pe, EEH_LOG_PERM);
+
+	/* Notify all devices that they're about to go down. */
+	eeh_set_channel_state(pe, pci_channel_io_perm_failure);
+	eeh_set_irq_state(pe, false);
+	eeh_pe_report("error_detected(permanent failure)", pe,
+		      eeh_report_failure, NULL);
+
+	/* Mark the PE to be removed permanently */
+	eeh_pe_state_mark(pe, EEH_PE_REMOVED);
+
+	/*
+	 * Shut down the device drivers for good. We mark
+	 * all removed devices correctly to avoid access
+	 * the their PCI config any more.
+	 */
+	if (pe->type & EEH_PE_VF) {
+		eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+	} else {
+		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+
+		pci_lock_rescan_remove();
+		pci_hp_remove_devices(bus);
+		pci_unlock_rescan_remove();
+		/* The passed PE should no longer be used */
+		return;
+	}
+final:
+	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+}
+
+/**
+ * eeh_handle_special_event - Handle EEH events without a specific failing PE
+ *
+ * Called when an EEH event is detected but can't be narrowed down to a
+ * specific PE.  Iterates through possible failures and handles them as
+ * necessary.
+ */
+void eeh_handle_special_event(void)
+{
+	struct eeh_pe *pe, *phb_pe, *tmp_pe;
+	struct eeh_dev *edev, *tmp_edev;
+	struct pci_bus *bus;
+	struct pci_controller *hose;
+	unsigned long flags;
+	int rc;
+
+
+	do {
+		rc = eeh_ops->next_error(&pe);
+
+		switch (rc) {
+		case EEH_NEXT_ERR_DEAD_IOC:
+			/* Mark all PHBs in dead state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events */
+			eeh_remove_event(NULL, true);
+
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe) continue;
+
+				eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+			}
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_FROZEN_PE:
+		case EEH_NEXT_ERR_FENCED_PHB:
+		case EEH_NEXT_ERR_DEAD_PHB:
+			/* Mark the PE in fenced state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events of the PHB */
+			eeh_remove_event(pe, true);
+
+			if (rc == EEH_NEXT_ERR_DEAD_PHB)
+				eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+			else
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_NONE:
+			return;
+		default:
+			pr_warn("%s: Invalid value %d from next_error()\n",
+				__func__, rc);
+			return;
+		}
+
+		/*
+		 * For fenced PHB and frozen PE, it's handled as normal
+		 * event. We have to remove the affected PHBs for dead
+		 * PHB and IOC
+		 */
+		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_handle_normal_event(pe);
+		} else {
+			pci_lock_rescan_remove();
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe ||
+				    !(phb_pe->state & EEH_PE_ISOLATED) ||
+				    (phb_pe->state & EEH_PE_RECOVERING))
+					continue;
+
+				eeh_for_each_pe(pe, tmp_pe)
+					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+						edev->mode &= ~EEH_DEV_NO_HANDLER;
+
+				/* Notify all devices to be down */
+				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+				eeh_set_channel_state(pe, pci_channel_io_perm_failure);
+				eeh_pe_report(
+					"error_detected(permanent failure)", pe,
+					eeh_report_failure, NULL);
+				bus = eeh_pe_bus_get(phb_pe);
+				if (!bus) {
+					pr_err("%s: Cannot find PCI bus for "
+					       "PHB#%x-PE#%x\n",
+					       __func__,
+					       pe->phb->global_number,
+					       pe->addr);
+					break;
+				}
+				pci_hp_remove_devices(bus);
+			}
+			pci_unlock_rescan_remove();
+		}
+
+		/*
+		 * If we have detected dead IOC, we needn't proceed
+		 * any more since all PHBs would have been removed
+		 */
+		if (rc == EEH_NEXT_ERR_DEAD_IOC)
+			break;
+	} while (rc != EEH_NEXT_ERR_NONE);
+}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
new file mode 100644
index 000000000..61c9356bf
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -0,0 +1,194 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ *  EEH error states may be detected within exception handlers;
+ *  however, the recovery processing needs to occur asynchronously
+ *  in a normal kernel context and not an interrupt context.
+ *  This pair of routines creates an event and queues it onto a
+ *  work-queue, where a worker thread can drive recovery.
+ */
+
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
+static LIST_HEAD(eeh_eventlist);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it.  The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+	struct eeh_pe *pe;
+
+	while (!kthread_should_stop()) {
+		if (down_interruptible(&eeh_eventlist_sem))
+			break;
+
+		/* Fetch EEH event from the queue */
+		spin_lock_irqsave(&eeh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&eeh_eventlist)) {
+			event = list_entry(eeh_eventlist.next,
+					   struct eeh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+		if (!event)
+			continue;
+
+		/* We might have event without binding PE */
+		pe = event->pe;
+		if (pe) {
+			if (pe->type & EEH_PE_PHB)
+				pr_info("EEH: Detected error on PHB#%x\n",
+					 pe->phb->global_number);
+			else
+				pr_info("EEH: Detected PCI bus error on "
+					"PHB#%x-PE#%x\n",
+					pe->phb->global_number, pe->addr);
+			eeh_handle_normal_event(pe);
+		} else {
+			eeh_handle_special_event();
+		}
+
+		kfree(event);
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_event_init - Start kernel thread to handle EEH events
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+int eeh_event_init(void)
+{
+	struct task_struct *t;
+	int ret = 0;
+
+	/* Initialize semaphore */
+	sema_init(&eeh_eventlist_sem, 0);
+
+	t = kthread_run(eeh_event_handler, NULL, "eehd");
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		pr_err("%s: Failed to start EEH daemon (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @pe: EEH PE
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+	if (!event) {
+		pr_err("EEH: out of memory, event not handled\n");
+		return -ENOMEM;
+	}
+	event->pe = pe;
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_add(&event->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	/* For EEH deamon to knick in */
+	up(&eeh_eventlist_sem);
+
+	return 0;
+}
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ * @force: Event will be removed unconditionally
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe, bool force)
+{
+	unsigned long flags;
+	struct eeh_event *event, *tmp;
+
+	/*
+	 * If we have NULL PE passed in, we have dead IOC
+	 * or we're sure we can report all existing errors
+	 * by the caller.
+	 *
+	 * With "force", the event with associated PE that
+	 * have been isolated, the event won't be removed
+	 * to avoid event lost.
+	 */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+		if (!force && event->pe &&
+		    (event->pe->state & EEH_PE_ISOLATED))
+			continue;
+
+		if (!pe) {
+			list_del(&event->list);
+			kfree(event);
+		} else if (pe->type & EEH_PE_PHB) {
+			if (event->pe && event->pe->phb == pe->phb) {
+				list_del(&event->list);
+				kfree(event);
+			}
+		} else if (event->pe == pe) {
+			list_del(&event->list);
+			kfree(event);
+		}
+	}
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
new file mode 100644
index 000000000..210d239a9
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -0,0 +1,954 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static int eeh_pe_aux_size = 0;
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_set_pe_aux_size - Set PE auxillary data size
+ * @size: PE auxillary data size
+ *
+ * Set PE auxillary data size
+ */
+void eeh_set_pe_aux_size(int size)
+{
+	if (size < 0)
+		return;
+
+	eeh_pe_aux_size = size;
+}
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate PE instance dynamically.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+	struct eeh_pe *pe;
+	size_t alloc_size;
+
+	alloc_size = sizeof(struct eeh_pe);
+	if (eeh_pe_aux_size) {
+		alloc_size = ALIGN(alloc_size, cache_line_size());
+		alloc_size += eeh_pe_aux_size;
+	}
+
+	/* Allocate PHB PE */
+	pe = kzalloc(alloc_size, GFP_KERNEL);
+	if (!pe) return NULL;
+
+	/* Initialize PHB PE */
+	pe->type = type;
+	pe->phb = phb;
+	INIT_LIST_HEAD(&pe->child_list);
+	INIT_LIST_HEAD(&pe->child);
+	INIT_LIST_HEAD(&pe->edevs);
+
+	pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe),
+				      cache_line_size());
+	return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int eeh_phb_pe_create(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Put it into the list */
+	list_add_tail(&pe->child, &eeh_phb_pe);
+
+	pr_debug("EEH: Add PE for PHB#%x\n", phb->global_number);
+
+	return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	list_for_each_entry(pe, &eeh_phb_pe, child) {
+		/*
+		 * Actually, we needn't check the type since
+		 * the PE for PHB has been determined when that
+		 * was created.
+		 */
+		if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
+			return pe;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root)
+{
+	struct list_head *next = pe->child_list.next;
+
+	if (next == &pe->child_list) {
+		while (1) {
+			if (pe == root)
+				return NULL;
+			next = pe->child.next;
+			if (next != &pe->parent->child_list)
+				break;
+			pe = pe->parent;
+		}
+	}
+
+	return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+void *eeh_pe_traverse(struct eeh_pe *root,
+		      eeh_pe_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	void *ret;
+
+	eeh_for_each_pe(root, pe) {
+		ret = fn(pe, flag);
+		if (ret) return ret;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+			  eeh_edev_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+	void *ret;
+
+	if (!root) {
+		pr_warn("%s: Invalid PE %p\n",
+			__func__, root);
+		return NULL;
+	}
+
+	/* Traverse root PE */
+	eeh_for_each_pe(root, pe) {
+		eeh_pe_for_each_dev(pe, edev, tmp) {
+			ret = fn(edev, flag);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or tranditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. The extra data referred by flag
+ * indicates which type of address should be used.
+ */
+struct eeh_pe_get_flag {
+	int pe_no;
+	int config_addr;
+};
+
+static void *__eeh_pe_get(struct eeh_pe *pe, void *flag)
+{
+	struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag;
+
+	/* Unexpected PHB PE */
+	if (pe->type & EEH_PE_PHB)
+		return NULL;
+
+	/*
+	 * We prefer PE address. For most cases, we should
+	 * have non-zero PE address
+	 */
+	if (eeh_has_flag(EEH_VALID_PE_ZERO)) {
+		if (tmp->pe_no == pe->addr)
+			return pe;
+	} else {
+		if (tmp->pe_no &&
+		    (tmp->pe_no == pe->addr))
+			return pe;
+	}
+
+	/* Try BDF address */
+	if (tmp->config_addr &&
+	   (tmp->config_addr == pe->config_addr))
+		return pe;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @phb: PCI controller
+ * @pe_no: PE number
+ * @config_addr: Config address
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
+		int pe_no, int config_addr)
+{
+	struct eeh_pe *root = eeh_phb_pe_get(phb);
+	struct eeh_pe_get_flag tmp = { pe_no, config_addr };
+	struct eeh_pe *pe;
+
+	pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp);
+
+	return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+	struct eeh_dev *parent;
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	/*
+	 * It might have the case for the indirect parent
+	 * EEH device already having associated PE, but
+	 * the direct parent EEH device doesn't have yet.
+	 */
+	if (edev->physfn)
+		pdn = pci_get_pdn(edev->physfn);
+	else
+		pdn = pdn ? pdn->parent : NULL;
+	while (pdn) {
+		/* We're poking out of PCI territory */
+		parent = pdn_to_eeh_dev(pdn);
+		if (!parent)
+			return NULL;
+
+		if (parent->pe)
+			return parent->pe;
+
+		pdn = pdn->parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent;
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	int config_addr = (pdn->busno << 8) | (pdn->devfn);
+
+	/* Check if the PE number is valid */
+	if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
+		pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
+		       __func__, config_addr, pdn->phb->global_number);
+		return -EINVAL;
+	}
+
+	/*
+	 * Search the PE has been existing or not according
+	 * to the PE address. If that has been existing, the
+	 * PE should be composed of PCI bus and its subordinate
+	 * components.
+	 */
+	pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
+	if (pe && !(pe->type & EEH_PE_INVALID)) {
+		/* Mark the PE as type of PCI bus */
+		pe->type = EEH_PE_BUS;
+		edev->pe = pe;
+
+		/* Put the edev to PE */
+		list_add_tail(&edev->list, &pe->edevs);
+		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
+			 pdn->phb->global_number,
+			 pdn->busno,
+			 PCI_SLOT(pdn->devfn),
+			 PCI_FUNC(pdn->devfn),
+			 pe->addr);
+		return 0;
+	} else if (pe && (pe->type & EEH_PE_INVALID)) {
+		list_add_tail(&edev->list, &pe->edevs);
+		edev->pe = pe;
+		/*
+		 * We're running to here because of PCI hotplug caused by
+		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
+		 */
+		parent = pe;
+		while (parent) {
+			if (!(parent->type & EEH_PE_INVALID))
+				break;
+			parent->type &= ~EEH_PE_INVALID;
+			parent = parent->parent;
+		}
+
+		pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
+			 "PE#%x, Parent PE#%x\n",
+			 pdn->phb->global_number,
+			 pdn->busno,
+			 PCI_SLOT(pdn->devfn),
+			 PCI_FUNC(pdn->devfn),
+			 pe->addr, pe->parent->addr);
+		return 0;
+	}
+
+	/* Create a new EEH PE */
+	if (edev->physfn)
+		pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF);
+	else
+		pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+	pe->addr	= edev->pe_config_addr;
+	pe->config_addr	= config_addr;
+
+	/*
+	 * Put the new EEH PE into hierarchy tree. If the parent
+	 * can't be found, the newly created PE will be attached
+	 * to PHB directly. Otherwise, we have to associate the
+	 * PE with its parent.
+	 */
+	parent = eeh_pe_get_parent(edev);
+	if (!parent) {
+		parent = eeh_phb_pe_get(pdn->phb);
+		if (!parent) {
+			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+				__func__, pdn->phb->global_number);
+			edev->pe = NULL;
+			kfree(pe);
+			return -EEXIST;
+		}
+	}
+	pe->parent = parent;
+
+	/*
+	 * Put the newly created PE into the child list and
+	 * link the EEH device accordingly.
+	 */
+	list_add_tail(&pe->child, &parent->child_list);
+	list_add_tail(&edev->list, &pe->edevs);
+	edev->pe = pe;
+	pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
+		 "Device PE#%x, Parent PE#%x\n",
+		 pdn->phb->global_number,
+		 pdn->busno,
+		 PCI_SLOT(pdn->devfn),
+		 PCI_FUNC(pdn->devfn),
+		 pe->addr, pe->parent->addr);
+
+	return 0;
+}
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ *
+ * The PE hierarchy tree might be changed when doing PCI hotplug.
+ * Also, the PCI devices or buses could be removed from the system
+ * during EEH recovery. So we have to call the function remove the
+ * corresponding PE accordingly if necessary.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent, *child;
+	int cnt;
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	if (!edev->pe) {
+		pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n",
+			 __func__,  pdn->phb->global_number,
+			 pdn->busno,
+			 PCI_SLOT(pdn->devfn),
+			 PCI_FUNC(pdn->devfn));
+		return -EEXIST;
+	}
+
+	/* Remove the EEH device */
+	pe = eeh_dev_to_pe(edev);
+	edev->pe = NULL;
+	list_del(&edev->list);
+
+	/*
+	 * Check if the parent PE includes any EEH devices.
+	 * If not, we should delete that. Also, we should
+	 * delete the parent PE if it doesn't have associated
+	 * child PEs and EEH devices.
+	 */
+	while (1) {
+		parent = pe->parent;
+		if (pe->type & EEH_PE_PHB)
+			break;
+
+		if (!(pe->state & EEH_PE_KEEP)) {
+			if (list_empty(&pe->edevs) &&
+			    list_empty(&pe->child_list)) {
+				list_del(&pe->child);
+				kfree(pe);
+			} else {
+				break;
+			}
+		} else {
+			if (list_empty(&pe->edevs)) {
+				cnt = 0;
+				list_for_each_entry(child, &pe->child_list, child) {
+					if (!(child->type & EEH_PE_INVALID)) {
+						cnt++;
+						break;
+					}
+				}
+
+				if (!cnt)
+					pe->type |= EEH_PE_INVALID;
+				else
+					break;
+			}
+		}
+
+		pe = parent;
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+	time64_t tstamp;
+
+	if (!pe) return;
+
+	if (pe->freeze_count <= 0) {
+		pe->freeze_count = 0;
+		pe->tstamp = ktime_get_seconds();
+	} else {
+		tstamp = ktime_get_seconds();
+		if (tstamp - pe->tstamp > 3600) {
+			pe->tstamp = tstamp;
+			pe->freeze_count = 0;
+		}
+	}
+}
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(struct eeh_pe *pe, void *flag)
+{
+	int state = *((int *)flag);
+	struct eeh_dev *edev, *tmp;
+	struct pci_dev *pdev;
+
+	/* Keep the state of permanently removed PE intact */
+	if (pe->state & EEH_PE_REMOVED)
+		return NULL;
+
+	pe->state |= state;
+
+	/* Offline PCI devices if applicable */
+	if (!(state & EEH_PE_ISOLATED))
+		return NULL;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev)
+			pdev->error_state = pci_channel_io_frozen;
+	}
+
+	/* Block PCI config access if required */
+	if (pe->state & EEH_PE_CFG_RESTRICTED)
+		pe->state |= EEH_PE_CFG_BLOCKED;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ *
+ * EEH error affects the current PE and its child PEs. The function
+ * is used to mark appropriate state for the affected PEs and the
+ * associated devices.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+}
+EXPORT_SYMBOL_GPL(eeh_pe_state_mark);
+
+static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
+{
+	int mode = *((int *)flag);
+
+	edev->mode |= mode;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_dev_state_mark - Mark state for all device under the PE
+ * @pe: EEH PE
+ *
+ * Mark specific state for all child devices of the PE.
+ */
+void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
+{
+	eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag)
+{
+	int state = *((int *)flag);
+	struct eeh_dev *edev, *tmp;
+	struct pci_dev *pdev;
+
+	/* Keep the state of permanently removed PE intact */
+	if (pe->state & EEH_PE_REMOVED)
+		return NULL;
+
+	pe->state &= ~state;
+
+	/*
+	 * Special treatment on clearing isolated state. Clear
+	 * check count since last isolation and put all affected
+	 * devices to normal state.
+	 */
+	if (!(state & EEH_PE_ISOLATED))
+		return NULL;
+
+	pe->check_count = 0;
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (!pdev)
+			continue;
+
+		pdev->error_state = pci_channel_io_normal;
+	}
+
+	/* Unblock PCI config access if required */
+	if (pe->state & EEH_PE_CFG_RESTRICTED)
+		pe->state &= ~EEH_PE_CFG_BLOCKED;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
+
+/**
+ * eeh_pe_state_mark_with_cfg - Mark PE state with unblocked config space
+ * @pe: PE
+ * @state: PE state to be set
+ *
+ * Set specified flag to PE and its child PEs. The PCI config space
+ * of some PEs is blocked automatically when EEH_PE_ISOLATED is set,
+ * which isn't needed in some situations. The function allows to set
+ * the specified flag to indicated PEs without blocking their PCI
+ * config space.
+ */
+void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+	if (!(state & EEH_PE_ISOLATED))
+		return;
+
+	/* Clear EEH_PE_CFG_BLOCKED, which might be set just now */
+	state = EEH_PE_CFG_BLOCKED;
+	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
+
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
+ *
+ * Don't use normal PCI-CFG accessors, which probably has been
+ * blocked on normal path during the stage. So we need utilize
+ * eeh operations, which is always permitted.
+ */
+static void eeh_bridge_check_link(struct eeh_dev *edev)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	int cap;
+	uint32_t val;
+	int timeout = 0;
+
+	/*
+	 * We only check root port and downstream ports of
+	 * PCIe switches
+	 */
+	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
+		return;
+
+	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
+		 __func__, pdn->phb->global_number,
+		 pdn->busno,
+		 PCI_SLOT(pdn->devfn),
+		 PCI_FUNC(pdn->devfn));
+
+	/* Check slot status */
+	cap = edev->pcie_cap;
+	eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val);
+	if (!(val & PCI_EXP_SLTSTA_PDS)) {
+		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		return;
+	}
+
+	/* Check power status if we have the capability */
+	eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCAP, 2, &val);
+	if (val & PCI_EXP_SLTCAP_PCP) {
+		eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val);
+		if (val & PCI_EXP_SLTCTL_PCC) {
+			pr_debug("  In power-off state, power it on ...\n");
+			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+			eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val);
+			msleep(2 * 1000);
+		}
+	}
+
+	/* Enable link */
+	eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCTL, 2, &val);
+	val &= ~PCI_EXP_LNKCTL_LD;
+	eeh_ops->write_config(pdn, cap + PCI_EXP_LNKCTL, 2, val);
+
+	/* Check link */
+	eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val);
+	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		msleep(1000);
+		return;
+	}
+
+	/* Wait the link is up until timeout (5s) */
+	timeout = 0;
+	while (timeout < 5000) {
+		msleep(20);
+		timeout += 20;
+
+		eeh_ops->read_config(pdn, cap + PCI_EXP_LNKSTA, 2, &val);
+		if (val & PCI_EXP_LNKSTA_DLLLA)
+			break;
+	}
+
+	if (val & PCI_EXP_LNKSTA_DLLLA)
+		pr_debug("  Link up (%s)\n",
+			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+	else
+		pr_debug("  Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF)	(((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct eeh_dev *edev)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	int i;
+
+	/*
+	 * Device BARs: 0x10 - 0x18
+	 * Bus numbers and windows: 0x18 - 0x30
+	 */
+	for (i = 4; i < 13; i++)
+		eeh_ops->write_config(pdn, i*4, 4, edev->config_space[i]);
+	/* Rom: 0x38 */
+	eeh_ops->write_config(pdn, 14*4, 4, edev->config_space[14]);
+
+	/* Cache line & Latency timer: 0xC 0xD */
+	eeh_ops->write_config(pdn, PCI_CACHE_LINE_SIZE, 1,
+                SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+        eeh_ops->write_config(pdn, PCI_LATENCY_TIMER, 1,
+                SAVED_BYTE(PCI_LATENCY_TIMER));
+	/* Max latency, min grant, interrupt ping and line: 0x3C */
+	eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]);
+
+	/* PCI Command: 0x4 */
+	eeh_ops->write_config(pdn, PCI_COMMAND, 4, edev->config_space[1] |
+			      PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+	/* Check the PCIe link is ready */
+	eeh_bridge_check_link(edev);
+}
+
+static void eeh_restore_device_bars(struct eeh_dev *edev)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+	int i;
+	u32 cmd;
+
+	for (i = 4; i < 10; i++)
+		eeh_ops->write_config(pdn, i*4, 4, edev->config_space[i]);
+	/* 12 == Expansion ROM Address */
+	eeh_ops->write_config(pdn, 12*4, 4, edev->config_space[12]);
+
+	eeh_ops->write_config(pdn, PCI_CACHE_LINE_SIZE, 1,
+		SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	eeh_ops->write_config(pdn, PCI_LATENCY_TIMER, 1,
+		SAVED_BYTE(PCI_LATENCY_TIMER));
+
+	/* max latency, min grant, interrupt pin and line */
+	eeh_ops->write_config(pdn, 15*4, 4, edev->config_space[15]);
+
+	/*
+	 * Restore PERR & SERR bits, some devices require it,
+	 * don't touch the other command bits
+	 */
+	eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cmd);
+	if (edev->config_space[1] & PCI_COMMAND_PARITY)
+		cmd |= PCI_COMMAND_PARITY;
+	else
+		cmd &= ~PCI_COMMAND_PARITY;
+	if (edev->config_space[1] & PCI_COMMAND_SERR)
+		cmd |= PCI_COMMAND_SERR;
+	else
+		cmd &= ~PCI_COMMAND_SERR;
+	eeh_ops->write_config(pdn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
+{
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	/* Do special restore for bridges */
+	if (edev->mode & EEH_DEV_BRIDGE)
+		eeh_restore_bridge_bars(edev);
+	else
+		eeh_restore_device_bars(edev);
+
+	if (eeh_ops->restore_config && pdn)
+		eeh_ops->restore_config(pdn);
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+	/*
+	 * We needn't take the EEH lock since eeh_pe_dev_traverse()
+	 * will take that.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
+/**
+ * eeh_pe_loc_get - Retrieve location code binding to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the location code of the given PE. If the primary PE bus
+ * is root bus, we will grab location code from PHB device tree node
+ * or root port. Otherwise, the upstream bridge's device tree node
+ * of the primary PE bus will be checked for the location code.
+ */
+const char *eeh_pe_loc_get(struct eeh_pe *pe)
+{
+	struct pci_bus *bus = eeh_pe_bus_get(pe);
+	struct device_node *dn;
+	const char *loc = NULL;
+
+	while (bus) {
+		dn = pci_bus_to_OF_node(bus);
+		if (!dn) {
+			bus = bus->parent;
+			continue;
+		}
+
+		if (pci_is_root_bus(bus))
+			loc = of_get_property(dn, "ibm,io-base-loc-code", NULL);
+		else
+			loc = of_get_property(dn, "ibm,slot-location-code",
+					      NULL);
+
+		if (loc)
+			return loc;
+
+		bus = bus->parent;
+	}
+
+	return "N/A";
+}
+
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+	struct eeh_dev *edev;
+	struct pci_dev *pdev;
+
+	if (pe->type & EEH_PE_PHB)
+		return pe->phb->bus;
+
+	/* The primary bus might be cached during probe time */
+	if (pe->state & EEH_PE_PRI_BUS)
+		return pe->bus;
+
+	/* Retrieve the parent PCI bus of first (top) PCI device */
+	edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+	pdev = eeh_dev_to_pci_dev(edev);
+	if (pdev)
+		return pdev->bus;
+
+	return NULL;
+}
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
new file mode 100644
index 000000000..deed906dd
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -0,0 +1,196 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format)               \
+static ssize_t eeh_show_##_name(struct device *dev,      \
+		struct device_attribute *attr, char *buf)          \
+{                                                        \
+	struct pci_dev *pdev = to_pci_dev(dev);               \
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);      \
+	                                                      \
+	if (!edev)                                            \
+		return 0;                                     \
+	                                                      \
+	return sprintf(buf, _format "\n", edev->_memb);       \
+}                                                        \
+static DEVICE_ATTR(_name, 0444, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
+
+static ssize_t eeh_pe_state_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	int state;
+
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	state = eeh_ops->get_state(edev->pe, NULL);
+	return sprintf(buf, "0x%08x 0x%08x\n",
+		       state, edev->pe->state);
+}
+
+static ssize_t eeh_pe_state_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	/* Nothing to do if it's not frozen */
+	if (!(edev->pe->state & EEH_PE_ISOLATED))
+		return count;
+
+	if (eeh_unfreeze_pe(edev->pe, true))
+		return -EIO;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(eeh_pe_state);
+
+#ifdef CONFIG_PCI_IOV
+static ssize_t eeh_notify_resume_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+
+	if (!edev || !edev->pe)
+		return -ENODEV;
+
+	pdn = pci_get_pdn(pdev);
+	return sprintf(buf, "%d\n", pdn->last_allow_rc);
+}
+
+static ssize_t eeh_notify_resume_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+	if (!edev || !edev->pe || !eeh_ops->notify_resume)
+		return -ENODEV;
+
+	if (eeh_ops->notify_resume(pci_get_pdn(pdev)))
+		return -EIO;
+
+	return count;
+}
+static DEVICE_ATTR_RW(eeh_notify_resume);
+
+static int eeh_notify_resume_add(struct pci_dev *pdev)
+{
+	struct device_node *np;
+	int rc = 0;
+
+	np = pci_device_to_OF_node(pdev->is_physfn ? pdev : pdev->physfn);
+
+	if (of_property_read_bool(np, "ibm,is-open-sriov-pf"))
+		rc = device_create_file(&pdev->dev, &dev_attr_eeh_notify_resume);
+
+	return rc;
+}
+
+static void eeh_notify_resume_remove(struct pci_dev *pdev)
+{
+	struct device_node *np;
+
+	np = pci_device_to_OF_node(pdev->is_physfn ? pdev : pdev->physfn);
+
+	if (of_property_read_bool(np, "ibm,is-open-sriov-pf"))
+		device_remove_file(&pdev->dev, &dev_attr_eeh_notify_resume);
+}
+#else
+static inline int eeh_notify_resume_add(struct pci_dev *pdev) { return 0; }
+static inline void eeh_notify_resume_remove(struct pci_dev *pdev) { }
+#endif /* CONFIG_PCI_IOV */
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	int rc=0;
+
+	if (!eeh_enabled())
+		return;
+
+	if (edev && (edev->mode & EEH_DEV_SYSFS))
+		return;
+
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state);
+	rc += eeh_notify_resume_add(pdev);
+
+	if (rc)
+		pr_warn("EEH: Unable to create sysfs entries\n");
+	else if (edev)
+		edev->mode |= EEH_DEV_SYSFS;
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+	/*
+	 * The parent directory might have been removed. We needn't
+	 * continue for that case.
+	 */
+	if (!pdev->dev.kobj.sd) {
+		if (edev)
+			edev->mode &= ~EEH_DEV_SYSFS;
+		return;
+	}
+
+	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state);
+
+	eeh_notify_resume_remove(pdev);
+
+	if (edev)
+		edev->mode &= ~EEH_DEV_SYSFS;
+}
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
new file mode 100644
index 000000000..26b3f853c
--- /dev/null
+++ b/arch/powerpc/kernel/entry_32.S
@@ -0,0 +1,1366 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Rewritten by Cort Dougan (cort@fsmlabs.com) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@fsmlabs.com>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *  MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This file contains the system call entry code, context switch
+ *  code, and exception/interrupt return code for PowerPC.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/sys.h>
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/asm-405.h>
+#include <asm/feature-fixups.h>
+#include <asm/barrier.h>
+
+/*
+ * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE.
+ */
+#if MSR_KERNEL >= 0x10000
+#define LOAD_MSR_KERNEL(r, x)	lis r,(x)@h; ori r,r,(x)@l
+#else
+#define LOAD_MSR_KERNEL(r, x)	li r,(x)
+#endif
+
+/*
+ * Align to 4k in order to ensure that all functions modyfing srr0/srr1
+ * fit into one page in order to not encounter a TLB miss between the
+ * modification of srr0/srr1 and the associated rfi.
+ */
+	.align	12
+
+#ifdef CONFIG_BOOKE
+	.globl	mcheck_transfer_to_handler
+mcheck_transfer_to_handler:
+	mfspr	r0,SPRN_DSRR0
+	stw	r0,_DSRR0(r11)
+	mfspr	r0,SPRN_DSRR1
+	stw	r0,_DSRR1(r11)
+	/* fall through */
+
+	.globl	debug_transfer_to_handler
+debug_transfer_to_handler:
+	mfspr	r0,SPRN_CSRR0
+	stw	r0,_CSRR0(r11)
+	mfspr	r0,SPRN_CSRR1
+	stw	r0,_CSRR1(r11)
+	/* fall through */
+
+	.globl	crit_transfer_to_handler
+crit_transfer_to_handler:
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	mfspr	r0,SPRN_MAS0
+	stw	r0,MAS0(r11)
+	mfspr	r0,SPRN_MAS1
+	stw	r0,MAS1(r11)
+	mfspr	r0,SPRN_MAS2
+	stw	r0,MAS2(r11)
+	mfspr	r0,SPRN_MAS3
+	stw	r0,MAS3(r11)
+	mfspr	r0,SPRN_MAS6
+	stw	r0,MAS6(r11)
+#ifdef CONFIG_PHYS_64BIT
+	mfspr	r0,SPRN_MAS7
+	stw	r0,MAS7(r11)
+#endif /* CONFIG_PHYS_64BIT */
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+#ifdef CONFIG_44x
+	mfspr	r0,SPRN_MMUCR
+	stw	r0,MMUCR(r11)
+#endif
+	mfspr	r0,SPRN_SRR0
+	stw	r0,_SRR0(r11)
+	mfspr	r0,SPRN_SRR1
+	stw	r0,_SRR1(r11)
+
+	/* set the stack limit to the current stack
+	 * and set the limit to protect the thread_info
+	 * struct
+	 */
+	mfspr	r8,SPRN_SPRG_THREAD
+	lwz	r0,KSP_LIMIT(r8)
+	stw	r0,SAVED_KSP_LIMIT(r11)
+	rlwimi	r0,r1,0,0,(31-THREAD_SHIFT)
+	stw	r0,KSP_LIMIT(r8)
+	/* fall through */
+#endif
+
+#ifdef CONFIG_40x
+	.globl	crit_transfer_to_handler
+crit_transfer_to_handler:
+	lwz	r0,crit_r10@l(0)
+	stw	r0,GPR10(r11)
+	lwz	r0,crit_r11@l(0)
+	stw	r0,GPR11(r11)
+	mfspr	r0,SPRN_SRR0
+	stw	r0,crit_srr0@l(0)
+	mfspr	r0,SPRN_SRR1
+	stw	r0,crit_srr1@l(0)
+
+	/* set the stack limit to the current stack
+	 * and set the limit to protect the thread_info
+	 * struct
+	 */
+	mfspr	r8,SPRN_SPRG_THREAD
+	lwz	r0,KSP_LIMIT(r8)
+	stw	r0,saved_ksp_limit@l(0)
+	rlwimi	r0,r1,0,0,(31-THREAD_SHIFT)
+	stw	r0,KSP_LIMIT(r8)
+	/* fall through */
+#endif
+
+/*
+ * This code finishes saving the registers to the exception frame
+ * and jumps to the appropriate handler for the exception, turning
+ * on address translation.
+ * Note that we rely on the caller having set cr0.eq iff the exception
+ * occurred in kernel mode (i.e. MSR:PR = 0).
+ */
+	.globl	transfer_to_handler_full
+transfer_to_handler_full:
+	SAVE_NVGPRS(r11)
+	/* fall through */
+
+	.globl	transfer_to_handler
+transfer_to_handler:
+	stw	r2,GPR2(r11)
+	stw	r12,_NIP(r11)
+	stw	r9,_MSR(r11)
+	andi.	r2,r9,MSR_PR
+	mfctr	r12
+	mfspr	r2,SPRN_XER
+	stw	r12,_CTR(r11)
+	stw	r2,_XER(r11)
+	mfspr	r12,SPRN_SPRG_THREAD
+	addi	r2,r12,-THREAD
+	tovirt(r2,r2)			/* set r2 to current */
+	beq	2f			/* if from user, fix up THREAD.regs */
+	addi	r11,r1,STACK_FRAME_OVERHEAD
+	stw	r11,PT_REGS(r12)
+#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+	/* Check to see if the dbcr0 register is set up to debug.  Use the
+	   internal debug mode bit to do this. */
+	lwz	r12,THREAD_DBCR0(r12)
+	andis.	r12,r12,DBCR0_IDM@h
+	beq+	3f
+	/* From user and task is ptraced - load up global dbcr0 */
+	li	r12,-1			/* clear all pending debug events */
+	mtspr	SPRN_DBSR,r12
+	lis	r11,global_dbcr0@ha
+	tophys(r11,r11)
+	addi	r11,r11,global_dbcr0@l
+#ifdef CONFIG_SMP
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r9,TI_CPU(r9)
+	slwi	r9,r9,3
+	add	r11,r11,r9
+#endif
+	lwz	r12,0(r11)
+	mtspr	SPRN_DBCR0,r12
+	lwz	r12,4(r11)
+	addi	r12,r12,-1
+	stw	r12,4(r11)
+#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	CURRENT_THREAD_INFO(r9, r1)
+	tophys(r9, r9)
+	ACCOUNT_CPU_USER_ENTRY(r9, r11, r12)
+#endif
+
+	b	3f
+
+2:	/* if from kernel, check interrupted DOZE/NAP mode and
+         * check for stack overflow
+         */
+	lwz	r9,KSP_LIMIT(r12)
+	cmplw	r1,r9			/* if r1 <= ksp_limit */
+	ble-	stack_ovf		/* then the kernel stack overflowed */
+5:
+#if defined(CONFIG_6xx) || defined(CONFIG_E500)
+	CURRENT_THREAD_INFO(r9, r1)
+	tophys(r9,r9)			/* check local flags */
+	lwz	r12,TI_LOCAL_FLAGS(r9)
+	mtcrf	0x01,r12
+	bt-	31-TLF_NAPPING,4f
+	bt-	31-TLF_SLEEPING,7f
+#endif /* CONFIG_6xx || CONFIG_E500 */
+	.globl transfer_to_handler_cont
+transfer_to_handler_cont:
+3:
+	mflr	r9
+	lwz	r11,0(r9)		/* virtual address of handler */
+	lwz	r9,4(r9)		/* where to go when done */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+	mtspr	SPRN_NRI, r0
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+	lis	r12,reenable_mmu@h
+	ori	r12,r12,reenable_mmu@l
+	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r10
+	SYNC
+	RFI
+reenable_mmu:				/* re-enable mmu so we can */
+	mfmsr	r10
+	lwz	r12,_MSR(r1)
+	xor	r10,r10,r12
+	andi.	r10,r10,MSR_EE		/* Did EE change? */
+	beq	1f
+
+	/*
+	 * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1.
+	 * If from user mode there is only one stack frame on the stack, and
+	 * accessing CALLER_ADDR1 will cause oops. So we need create a dummy
+	 * stack frame to make trace_hardirqs_off happy.
+	 *
+	 * This is handy because we also need to save a bunch of GPRs,
+	 * r3 can be different from GPR3(r1) at this point, r9 and r11
+	 * contains the old MSR and handler address respectively,
+	 * r4 & r5 can contain page fault arguments that need to be passed
+	 * along as well. r12, CCR, CTR, XER etc... are left clobbered as
+	 * they aren't useful past this point (aren't syscall arguments),
+	 * the rest is restored from the exception frame.
+	 */
+	stwu	r1,-32(r1)
+	stw	r9,8(r1)
+	stw	r11,12(r1)
+	stw	r3,16(r1)
+	stw	r4,20(r1)
+	stw	r5,24(r1)
+	bl	trace_hardirqs_off
+	lwz	r5,24(r1)
+	lwz	r4,20(r1)
+	lwz	r3,16(r1)
+	lwz	r11,12(r1)
+	lwz	r9,8(r1)
+	addi	r1,r1,32
+	lwz	r0,GPR0(r1)
+	lwz	r6,GPR6(r1)
+	lwz	r7,GPR7(r1)
+	lwz	r8,GPR8(r1)
+1:	mtctr	r11
+	mtlr	r9
+	bctr				/* jump to handler */
+#else /* CONFIG_TRACE_IRQFLAGS */
+	mtspr	SPRN_SRR0,r11
+	mtspr	SPRN_SRR1,r10
+	mtlr	r9
+	SYNC
+	RFI				/* jump to handler, enable MMU */
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+#if defined (CONFIG_6xx) || defined(CONFIG_E500)
+4:	rlwinm	r12,r12,0,~_TLF_NAPPING
+	stw	r12,TI_LOCAL_FLAGS(r9)
+	b	power_save_ppc32_restore
+
+7:	rlwinm	r12,r12,0,~_TLF_SLEEPING
+	stw	r12,TI_LOCAL_FLAGS(r9)
+	lwz	r9,_MSR(r11)		/* if sleeping, clear MSR.EE */
+	rlwinm	r9,r9,0,~MSR_EE
+	lwz	r12,_LINK(r11)		/* and return to address in LR */
+	b	fast_exception_return
+#endif
+
+/*
+ * On kernel stack overflow, load up an initial stack pointer
+ * and call StackOverflow(regs), which should not return.
+ */
+stack_ovf:
+	/* sometimes we use a statically-allocated stack, which is OK. */
+	lis	r12,_end@h
+	ori	r12,r12,_end@l
+	cmplw	r1,r12
+	ble	5b			/* r1 <= &_end is OK */
+	SAVE_NVGPRS(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lis	r1,init_thread_union@ha
+	addi	r1,r1,init_thread_union@l
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	lis	r9,StackOverflow@ha
+	addi	r9,r9,StackOverflow@l
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+	mtspr	SPRN_NRI, r0
+#endif
+	mtspr	SPRN_SRR0,r9
+	mtspr	SPRN_SRR1,r10
+	SYNC
+	RFI
+
+/*
+ * Handle a system call.
+ */
+	.stabs	"arch/powerpc/kernel/",N_SO,0,0,0f
+	.stabs	"entry_32.S",N_SO,0,0,0f
+0:
+
+_GLOBAL(DoSyscall)
+	stw	r3,ORIG_GPR3(r1)
+	li	r12,0
+	stw	r12,RESULT(r1)
+	lwz	r11,_CCR(r1)	/* Clear SO bit in CR */
+	rlwinm	r11,r11,0,4,2
+	stw	r11,_CCR(r1)
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* Return from syscalls can (and generally will) hard enable
+	 * interrupts. You aren't supposed to call a syscall with
+	 * interrupts disabled in the first place. However, to ensure
+	 * that we get it right vs. lockdep if it happens, we force
+	 * that hard enable here with appropriate tracing if we see
+	 * that we have been called with interrupts off
+	 */
+	mfmsr	r11
+	andi.	r12,r11,MSR_EE
+	bne+	1f
+	/* We came in with interrupts disabled, we enable them now */
+	bl	trace_hardirqs_on
+	mfmsr	r11
+	lwz	r0,GPR0(r1)
+	lwz	r3,GPR3(r1)
+	lwz	r4,GPR4(r1)
+	ori	r11,r11,MSR_EE
+	lwz	r5,GPR5(r1)
+	lwz	r6,GPR6(r1)
+	lwz	r7,GPR7(r1)
+	lwz	r8,GPR8(r1)
+	mtmsr	r11
+1:
+#endif /* CONFIG_TRACE_IRQFLAGS */
+	CURRENT_THREAD_INFO(r10, r1)
+	lwz	r11,TI_FLAGS(r10)
+	andi.	r11,r11,_TIF_SYSCALL_DOTRACE
+	bne-	syscall_dotrace
+syscall_dotrace_cont:
+	cmplwi	0,r0,NR_syscalls
+	lis	r10,sys_call_table@h
+	ori	r10,r10,sys_call_table@l
+	slwi	r0,r0,2
+	bge-	66f
+
+	barrier_nospec_asm
+	/*
+	 * Prevent the load of the handler below (based on the user-passed
+	 * system call number) being speculatively executed until the test
+	 * against NR_syscalls and branch to .66f above has
+	 * committed.
+	 */
+
+	lwzx	r10,r10,r0	/* Fetch system call handler [ptr] */
+	mtlr	r10
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+	PPC440EP_ERR42
+	blrl			/* Call handler */
+	.globl	ret_from_syscall
+ret_from_syscall:
+#ifdef CONFIG_DEBUG_RSEQ
+	/* Check whether the syscall is issued inside a restartable sequence */
+	stw	r3,GPR3(r1)
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      rseq_syscall
+	lwz	r3,GPR3(r1)
+#endif
+	mr	r6,r3
+	CURRENT_THREAD_INFO(r12, r1)
+	/* disable interrupts so current_thread_info()->flags can't change */
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
+	/* Note: We don't bother telling lockdep about it */
+	SYNC
+	MTMSRD(r10)
+	lwz	r9,TI_FLAGS(r12)
+	li	r8,-MAX_ERRNO
+	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
+	bne-	syscall_exit_work
+	cmplw	0,r3,r8
+	blt+	syscall_exit_cont
+	lwz	r11,_CCR(r1)			/* Load CR */
+	neg	r3,r3
+	oris	r11,r11,0x1000	/* Set SO bit in CR */
+	stw	r11,_CCR(r1)
+syscall_exit_cont:
+	lwz	r8,_MSR(r1)
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* If we are going to return from the syscall with interrupts
+	 * off, we trace that here. It shouldn't happen though but we
+	 * want to catch the bugger if it does right ?
+	 */
+	andi.	r10,r8,MSR_EE
+	bne+	1f
+	stw	r3,GPR3(r1)
+	bl      trace_hardirqs_off
+	lwz	r3,GPR3(r1)
+1:
+#endif /* CONFIG_TRACE_IRQFLAGS */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+	/* If the process has its own DBCR0 value, load it up.  The internal
+	   debug mode bit tells us that dbcr0 should be loaded. */
+	lwz	r0,THREAD+THREAD_DBCR0(r2)
+	andis.	r10,r0,DBCR0_IDM@h
+	bnel-	load_dbcr0
+#endif
+#ifdef CONFIG_44x
+BEGIN_MMU_FTR_SECTION
+	lis	r4,icache_44x_need_flush@ha
+	lwz	r5,icache_44x_need_flush@l(r4)
+	cmplwi	cr0,r5,0
+	bne-	2f
+1:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_47x)
+#endif /* CONFIG_44x */
+BEGIN_FTR_SECTION
+	lwarx	r7,0,r1
+END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
+	stwcx.	r0,0,r1			/* to clear the reservation */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	andi.	r4,r8,MSR_PR
+	beq	3f
+	CURRENT_THREAD_INFO(r4, r1)
+	ACCOUNT_CPU_USER_EXIT(r4, r5, r7)
+3:
+#endif
+	lwz	r4,_LINK(r1)
+	lwz	r5,_CCR(r1)
+	mtlr	r4
+	mtcr	r5
+	lwz	r7,_NIP(r1)
+	lwz	r2,GPR2(r1)
+	lwz	r1,GPR1(r1)
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+	mtspr	SPRN_NRI, r0
+#endif
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r8
+	SYNC
+	RFI
+#ifdef CONFIG_44x
+2:	li	r7,0
+	iccci	r0,r0
+	stw	r7,icache_44x_need_flush@l(r4)
+	b	1b
+#endif  /* CONFIG_44x */
+
+66:	li	r3,-ENOSYS
+	b	ret_from_syscall
+
+	.globl	ret_from_fork
+ret_from_fork:
+	REST_NVGPRS(r1)
+	bl	schedule_tail
+	li	r3,0
+	b	ret_from_syscall
+
+	.globl	ret_from_kernel_thread
+ret_from_kernel_thread:
+	REST_NVGPRS(r1)
+	bl	schedule_tail
+	mtlr	r14
+	mr	r3,r15
+	PPC440EP_ERR42
+	blrl
+	li	r3,0
+	b	ret_from_syscall
+
+/* Traced system call support */
+syscall_dotrace:
+	SAVE_NVGPRS(r1)
+	li	r0,0xc00
+	stw	r0,_TRAP(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_enter
+	/*
+	 * Restore argument registers possibly just changed.
+	 * We use the return value of do_syscall_trace_enter
+	 * for call number to look up in the table (r0).
+	 */
+	mr	r0,r3
+	lwz	r3,GPR3(r1)
+	lwz	r4,GPR4(r1)
+	lwz	r5,GPR5(r1)
+	lwz	r6,GPR6(r1)
+	lwz	r7,GPR7(r1)
+	lwz	r8,GPR8(r1)
+	REST_NVGPRS(r1)
+
+	cmplwi	r0,NR_syscalls
+	/* Return code is already in r3 thanks to do_syscall_trace_enter() */
+	bge-	ret_from_syscall
+	b	syscall_dotrace_cont
+
+syscall_exit_work:
+	andi.	r0,r9,_TIF_RESTOREALL
+	beq+	0f
+	REST_NVGPRS(r1)
+	b	2f
+0:	cmplw	0,r3,r8
+	blt+	1f
+	andi.	r0,r9,_TIF_NOERROR
+	bne-	1f
+	lwz	r11,_CCR(r1)			/* Load CR */
+	neg	r3,r3
+	oris	r11,r11,0x1000	/* Set SO bit in CR */
+	stw	r11,_CCR(r1)
+
+1:	stw	r6,RESULT(r1)	/* Save result */
+	stw	r3,GPR3(r1)	/* Update return value */
+2:	andi.	r0,r9,(_TIF_PERSYSCALL_MASK)
+	beq	4f
+
+	/* Clear per-syscall TIF flags if any are set.  */
+
+	li	r11,_TIF_PERSYSCALL_MASK
+	addi	r12,r12,TI_FLAGS
+3:	lwarx	r8,0,r12
+	andc	r8,r8,r11
+#ifdef CONFIG_IBM405_ERR77
+	dcbt	0,r12
+#endif
+	stwcx.	r8,0,r12
+	bne-	3b
+	subi	r12,r12,TI_FLAGS
+	
+4:	/* Anything which requires enabling interrupts? */
+	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
+	beq	ret_from_except
+
+	/* Re-enable interrupts. There is no need to trace that with
+	 * lockdep as we are supposed to have IRQs on at this point
+	 */
+	ori	r10,r10,MSR_EE
+	SYNC
+	MTMSRD(r10)
+
+	/* Save NVGPRS if they're not saved already */
+	lwz	r4,_TRAP(r1)
+	andi.	r4,r4,1
+	beq	5f
+	SAVE_NVGPRS(r1)
+	li	r4,0xc00
+	stw	r4,_TRAP(r1)
+5:
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_leave
+	b	ret_from_except_full
+
+/*
+ * The fork/clone functions need to copy the full register set into
+ * the child process. Therefore we need to save all the nonvolatile
+ * registers (r13 - r31) before calling the C code.
+ */
+	.globl	ppc_fork
+ppc_fork:
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
+	stw	r0,_TRAP(r1)		/* register set saved */
+	b	sys_fork
+
+	.globl	ppc_vfork
+ppc_vfork:
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
+	stw	r0,_TRAP(r1)		/* register set saved */
+	b	sys_vfork
+
+	.globl	ppc_clone
+ppc_clone:
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
+	stw	r0,_TRAP(r1)		/* register set saved */
+	b	sys_clone
+
+	.globl	ppc_swapcontext
+ppc_swapcontext:
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
+	stw	r0,_TRAP(r1)		/* register set saved */
+	b	sys_swapcontext
+
+/*
+ * Top-level page fault handling.
+ * This is in assembler because if do_page_fault tells us that
+ * it is a bad kernel page fault, we want to save the non-volatile
+ * registers before calling bad_page_fault.
+ */
+	.globl	handle_page_fault
+handle_page_fault:
+	stw	r4,_DAR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_6xx
+	andis.  r0,r5,DSISR_DABRMATCH@h
+	bne-    handle_dabr_fault
+#endif
+	bl	do_page_fault
+	cmpwi	r3,0
+	beq+	ret_from_except
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	clrrwi	r0,r0,1
+	stw	r0,_TRAP(r1)
+	mr	r5,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lwz	r4,_DAR(r1)
+	bl	bad_page_fault
+	b	ret_from_except_full
+
+#ifdef CONFIG_6xx
+	/* We have a data breakpoint exception - handle it */
+handle_dabr_fault:
+	SAVE_NVGPRS(r1)
+	lwz	r0,_TRAP(r1)
+	clrrwi	r0,r0,1
+	stw	r0,_TRAP(r1)
+	bl      do_break
+	b	ret_from_except_full
+#endif
+
+/*
+ * This routine switches between two different tasks.  The process
+ * state of one is saved on its kernel stack.  Then the state
+ * of the other is restored from its kernel stack.  The memory
+ * management hardware is updated to the second process's state.
+ * Finally, we can return to the second process.
+ * On entry, r3 points to the THREAD for the current task, r4
+ * points to the THREAD for the new task.
+ *
+ * This routine is always called with interrupts disabled.
+ *
+ * Note: there are two ways to get to the "going out" portion
+ * of this code; either by coming in via the entry (_switch)
+ * or via "fork" which must set up an environment equivalent
+ * to the "_switch" path.  If you change this , you'll have to
+ * change the fork code also.
+ *
+ * The code which creates the new task context is in 'copy_thread'
+ * in arch/ppc/kernel/process.c
+ */
+_GLOBAL(_switch)
+	stwu	r1,-INT_FRAME_SIZE(r1)
+	mflr	r0
+	stw	r0,INT_FRAME_SIZE+4(r1)
+	/* r3-r12 are caller saved -- Cort */
+	SAVE_NVGPRS(r1)
+	stw	r0,_NIP(r1)	/* Return to switch caller */
+	mfmsr	r11
+	li	r0,MSR_FP	/* Disable floating-point */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r0,r0,MSR_VEC@h	/* Disable altivec */
+	mfspr	r12,SPRN_VRSAVE	/* save vrsave register value */
+	stw	r12,THREAD+THREAD_VRSAVE(r2)
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_SPE
+BEGIN_FTR_SECTION
+	oris	r0,r0,MSR_SPE@h	 /* Disable SPE */
+	mfspr	r12,SPRN_SPEFSCR /* save spefscr register value */
+	stw	r12,THREAD+THREAD_SPEFSCR(r2)
+END_FTR_SECTION_IFSET(CPU_FTR_SPE)
+#endif /* CONFIG_SPE */
+	and.	r0,r0,r11	/* FP or altivec or SPE enabled? */
+	beq+	1f
+	andc	r11,r11,r0
+	MTMSRD(r11)
+	isync
+1:	stw	r11,_MSR(r1)
+	mfcr	r10
+	stw	r10,_CCR(r1)
+	stw	r1,KSP(r3)	/* Set old stack pointer */
+
+#ifdef CONFIG_SMP
+	/* We need a sync somewhere here to make sure that if the
+	 * previous task gets rescheduled on another CPU, it sees all
+	 * stores it has performed on this one.
+	 */
+	sync
+#endif /* CONFIG_SMP */
+
+	tophys(r0,r4)
+	mtspr	SPRN_SPRG_THREAD,r0	/* Update current THREAD phys addr */
+	lwz	r1,KSP(r4)	/* Load new stack pointer */
+
+	/* save the old current 'last' for return value */
+	mr	r3,r2
+	addi	r2,r4,-THREAD	/* Update current */
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	lwz	r0,THREAD+THREAD_VRSAVE(r2)
+	mtspr	SPRN_VRSAVE,r0		/* if G4, restore VRSAVE reg */
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_SPE
+BEGIN_FTR_SECTION
+	lwz	r0,THREAD+THREAD_SPEFSCR(r2)
+	mtspr	SPRN_SPEFSCR,r0		/* restore SPEFSCR reg */
+END_FTR_SECTION_IFSET(CPU_FTR_SPE)
+#endif /* CONFIG_SPE */
+
+	lwz	r0,_CCR(r1)
+	mtcrf	0xFF,r0
+	/* r3-r12 are destroyed -- Cort */
+	REST_NVGPRS(r1)
+
+	lwz	r4,_NIP(r1)	/* Return to _switch caller in new task */
+	mtlr	r4
+	addi	r1,r1,INT_FRAME_SIZE
+	blr
+
+	.globl	fast_exception_return
+fast_exception_return:
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+	andi.	r10,r9,MSR_RI		/* check for recoverable interrupt */
+	beq	1f			/* if not, we've got problems */
+#endif
+
+2:	REST_4GPRS(3, r11)
+	lwz	r10,_CCR(r11)
+	REST_GPR(1, r11)
+	mtcr	r10
+	lwz	r10,_LINK(r11)
+	mtlr	r10
+	/* Clear the exception_marker on the stack to avoid confusing stacktrace */
+	li	r10, 0
+	stw	r10, 8(r11)
+	REST_GPR(10, r11)
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+	mtspr	SPRN_NRI, r0
+#endif
+	mtspr	SPRN_SRR1,r9
+	mtspr	SPRN_SRR0,r12
+	REST_GPR(9, r11)
+	REST_GPR(12, r11)
+	lwz	r11,GPR11(r11)
+	SYNC
+	RFI
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+/* check if the exception happened in a restartable section */
+1:	lis	r3,exc_exit_restart_end@ha
+	addi	r3,r3,exc_exit_restart_end@l
+	cmplw	r12,r3
+	bge	3f
+	lis	r4,exc_exit_restart@ha
+	addi	r4,r4,exc_exit_restart@l
+	cmplw	r12,r4
+	blt	3f
+	lis	r3,fee_restarts@ha
+	tophys(r3,r3)
+	lwz	r5,fee_restarts@l(r3)
+	addi	r5,r5,1
+	stw	r5,fee_restarts@l(r3)
+	mr	r12,r4		/* restart at exc_exit_restart */
+	b	2b
+
+	.section .bss
+	.align	2
+fee_restarts:
+	.space	4
+	.previous
+
+/* aargh, a nonrecoverable interrupt, panic */
+/* aargh, we don't know which trap this is */
+/* but the 601 doesn't implement the RI bit, so assume it's OK */
+3:
+BEGIN_FTR_SECTION
+	b	2b
+END_FTR_SECTION_IFSET(CPU_FTR_601)
+	li	r10,-1
+	stw	r10,_TRAP(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lis	r10,MSR_KERNEL@h
+	ori	r10,r10,MSR_KERNEL@l
+	bl	transfer_to_handler_full
+	.long	nonrecoverable_exception
+	.long	ret_from_except
+#endif
+
+	.globl	ret_from_except_full
+ret_from_except_full:
+	REST_NVGPRS(r1)
+	/* fall through */
+
+	.globl	ret_from_except
+ret_from_except:
+	/* Hard-disable interrupts so that current_thread_info()->flags
+	 * can't change between when we test it and when we return
+	 * from the interrupt. */
+	/* Note: We don't bother telling lockdep about it */
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	SYNC			/* Some chip revs have problems here... */
+	MTMSRD(r10)		/* disable interrupts */
+
+	lwz	r3,_MSR(r1)	/* Returning to user mode? */
+	andi.	r0,r3,MSR_PR
+	beq	resume_kernel
+
+user_exc_return:		/* r10 contains MSR_KERNEL here */
+	/* Check current_thread_info()->flags */
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r9,TI_FLAGS(r9)
+	andi.	r0,r9,_TIF_USER_WORK_MASK
+	bne	do_work
+
+restore_user:
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+	/* Check whether this process has its own DBCR0 value.  The internal
+	   debug mode bit tells us that dbcr0 should be loaded. */
+	lwz	r0,THREAD+THREAD_DBCR0(r2)
+	andis.	r10,r0,DBCR0_IDM@h
+	bnel-	load_dbcr0
+#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+	CURRENT_THREAD_INFO(r9, r1)
+	ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
+#endif
+
+	b	restore
+
+/* N.B. the only way to get here is from the beq following ret_from_except. */
+resume_kernel:
+	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r8,TI_FLAGS(r9)
+	andis.	r0,r8,_TIF_EMULATE_STACK_STORE@h
+	beq+	1f
+
+	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
+
+	lwz	r3,GPR1(r1)
+	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
+	mr	r4,r1			/* src:  current exception frame */
+	mr	r1,r3			/* Reroute the trampoline frame to r1 */
+
+	/* Copy from the original to the trampoline. */
+	li	r5,INT_FRAME_SIZE/4	/* size: INT_FRAME_SIZE */
+	li	r6,0			/* start offset: 0 */
+	mtctr	r5
+2:	lwzx	r0,r6,r4
+	stwx	r0,r6,r3
+	addi	r6,r6,4
+	bdnz	2b
+
+	/* Do real store operation to complete stwu */
+	lwz	r5,GPR1(r1)
+	stw	r8,0(r5)
+
+	/* Clear _TIF_EMULATE_STACK_STORE flag */
+	lis	r11,_TIF_EMULATE_STACK_STORE@h
+	addi	r5,r9,TI_FLAGS
+0:	lwarx	r8,0,r5
+	andc	r8,r8,r11
+#ifdef CONFIG_IBM405_ERR77
+	dcbt	0,r5
+#endif
+	stwcx.	r8,0,r5
+	bne-	0b
+1:
+
+#ifdef CONFIG_PREEMPT
+	/* check current_thread_info->preempt_count */
+	lwz	r0,TI_PREEMPT(r9)
+	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
+	bne	restore
+	andi.	r8,r8,_TIF_NEED_RESCHED
+	beq+	restore
+	lwz	r3,_MSR(r1)
+	andi.	r0,r3,MSR_EE	/* interrupts off? */
+	beq	restore		/* don't schedule if so */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* Lockdep thinks irqs are enabled, we need to call
+	 * preempt_schedule_irq with IRQs off, so we inform lockdep
+	 * now that we -did- turn them off already
+	 */
+	bl	trace_hardirqs_off
+#endif
+1:	bl	preempt_schedule_irq
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r3,TI_FLAGS(r9)
+	andi.	r0,r3,_TIF_NEED_RESCHED
+	bne-	1b
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* And now, to properly rebalance the above, we tell lockdep they
+	 * are being turned back on, which will happen when we return
+	 */
+	bl	trace_hardirqs_on
+#endif
+#endif /* CONFIG_PREEMPT */
+
+	/* interrupts are hard-disabled at this point */
+restore:
+#ifdef CONFIG_44x
+BEGIN_MMU_FTR_SECTION
+	b	1f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	lis	r4,icache_44x_need_flush@ha
+	lwz	r5,icache_44x_need_flush@l(r4)
+	cmplwi	cr0,r5,0
+	beq+	1f
+	li	r6,0
+	iccci	r0,r0
+	stw	r6,icache_44x_need_flush@l(r4)
+1:
+#endif  /* CONFIG_44x */
+
+	lwz	r9,_MSR(r1)
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* Lockdep doesn't know about the fact that IRQs are temporarily turned
+	 * off in this assembly code while peeking at TI_FLAGS() and such. However
+	 * we need to inform it if the exception turned interrupts off, and we
+	 * are about to trun them back on.
+	 *
+	 * The problem here sadly is that we don't know whether the exceptions was
+	 * one that turned interrupts off or not. So we always tell lockdep about
+	 * turning them on here when we go back to wherever we came from with EE
+	 * on, even if that may meen some redudant calls being tracked. Maybe later
+	 * we could encode what the exception did somewhere or test the exception
+	 * type in the pt_regs but that sounds overkill
+	 */
+	andi.	r10,r9,MSR_EE
+	beq	1f
+	/*
+	 * Since the ftrace irqsoff latency trace checks CALLER_ADDR1,
+	 * which is the stack frame here, we need to force a stack frame
+	 * in case we came from user space.
+	 */
+	stwu	r1,-32(r1)
+	mflr	r0
+	stw	r0,4(r1)
+	stwu	r1,-32(r1)
+	bl	trace_hardirqs_on
+	lwz	r1,0(r1)
+	lwz	r1,0(r1)
+	lwz	r9,_MSR(r1)
+1:
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+	lwz	r0,GPR0(r1)
+	lwz	r2,GPR2(r1)
+	REST_4GPRS(3, r1)
+	REST_2GPRS(7, r1)
+
+	lwz	r10,_XER(r1)
+	lwz	r11,_CTR(r1)
+	mtspr	SPRN_XER,r10
+	mtctr	r11
+
+	PPC405_ERR77(0,r1)
+BEGIN_FTR_SECTION
+	lwarx	r11,0,r1
+END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
+	stwcx.	r0,0,r1			/* to clear the reservation */
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+	andi.	r10,r9,MSR_RI		/* check if this exception occurred */
+	beql	nonrecoverable		/* at a bad place (MSR:RI = 0) */
+
+	lwz	r10,_CCR(r1)
+	lwz	r11,_LINK(r1)
+	mtcrf	0xFF,r10
+	mtlr	r11
+
+	/* Clear the exception_marker on the stack to avoid confusing stacktrace */
+	li	r10, 0
+	stw	r10, 8(r1)
+	/*
+	 * Once we put values in SRR0 and SRR1, we are in a state
+	 * where exceptions are not recoverable, since taking an
+	 * exception will trash SRR0 and SRR1.  Therefore we clear the
+	 * MSR:RI bit to indicate this.  If we do take an exception,
+	 * we can't return to the point of the exception but we
+	 * can restart the exception exit path at the label
+	 * exc_exit_restart below.  -- paulus
+	 */
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
+	SYNC
+	MTMSRD(r10)		/* clear the RI bit */
+	.globl exc_exit_restart
+exc_exit_restart:
+	lwz	r12,_NIP(r1)
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+	mtspr	SPRN_NRI, r0
+#endif
+	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r9
+	REST_4GPRS(9, r1)
+	lwz	r1,GPR1(r1)
+	.globl exc_exit_restart_end
+exc_exit_restart_end:
+	SYNC
+	RFI
+
+#else /* !(CONFIG_4xx || CONFIG_BOOKE) */
+	/*
+	 * This is a bit different on 4xx/Book-E because it doesn't have
+	 * the RI bit in the MSR.
+	 * The TLB miss handler checks if we have interrupted
+	 * the exception exit path and restarts it if so
+	 * (well maybe one day it will... :).
+	 */
+	lwz	r11,_LINK(r1)
+	mtlr	r11
+	lwz	r10,_CCR(r1)
+	mtcrf	0xff,r10
+	/* Clear the exception_marker on the stack to avoid confusing stacktrace */
+	li	r10, 0
+	stw	r10, 8(r1)
+	REST_2GPRS(9, r1)
+	.globl exc_exit_restart
+exc_exit_restart:
+	lwz	r11,_NIP(r1)
+	lwz	r12,_MSR(r1)
+exc_exit_start:
+	mtspr	SPRN_SRR0,r11
+	mtspr	SPRN_SRR1,r12
+	REST_2GPRS(11, r1)
+	lwz	r1,GPR1(r1)
+	.globl exc_exit_restart_end
+exc_exit_restart_end:
+	PPC405_ERR77_SYNC
+	rfi
+	b	.			/* prevent prefetch past rfi */
+
+/*
+ * Returning from a critical interrupt in user mode doesn't need
+ * to be any different from a normal exception.  For a critical
+ * interrupt in the kernel, we just return (without checking for
+ * preemption) since the interrupt may have happened at some crucial
+ * place (e.g. inside the TLB miss handler), and because we will be
+ * running with r1 pointing into critical_stack, not the current
+ * process's kernel stack (and therefore current_thread_info() will
+ * give the wrong answer).
+ * We have to restore various SPRs that may have been in use at the
+ * time of the critical interrupt.
+ *
+ */
+#ifdef CONFIG_40x
+#define PPC_40x_TURN_OFF_MSR_DR						    \
+	/* avoid any possible TLB misses here by turning off MSR.DR, we	    \
+	 * assume the instructions here are mapped by a pinned TLB entry */ \
+	li	r10,MSR_IR;						    \
+	mtmsr	r10;							    \
+	isync;								    \
+	tophys(r1, r1);
+#else
+#define PPC_40x_TURN_OFF_MSR_DR
+#endif
+
+#define RET_FROM_EXC_LEVEL(exc_lvl_srr0, exc_lvl_srr1, exc_lvl_rfi)	\
+	REST_NVGPRS(r1);						\
+	lwz	r3,_MSR(r1);						\
+	andi.	r3,r3,MSR_PR;						\
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL);				\
+	bne	user_exc_return;					\
+	lwz	r0,GPR0(r1);						\
+	lwz	r2,GPR2(r1);						\
+	REST_4GPRS(3, r1);						\
+	REST_2GPRS(7, r1);						\
+	lwz	r10,_XER(r1);						\
+	lwz	r11,_CTR(r1);						\
+	mtspr	SPRN_XER,r10;						\
+	mtctr	r11;							\
+	PPC405_ERR77(0,r1);						\
+	stwcx.	r0,0,r1;		/* to clear the reservation */	\
+	lwz	r11,_LINK(r1);						\
+	mtlr	r11;							\
+	lwz	r10,_CCR(r1);						\
+	mtcrf	0xff,r10;						\
+	PPC_40x_TURN_OFF_MSR_DR;					\
+	lwz	r9,_DEAR(r1);						\
+	lwz	r10,_ESR(r1);						\
+	mtspr	SPRN_DEAR,r9;						\
+	mtspr	SPRN_ESR,r10;						\
+	lwz	r11,_NIP(r1);						\
+	lwz	r12,_MSR(r1);						\
+	mtspr	exc_lvl_srr0,r11;					\
+	mtspr	exc_lvl_srr1,r12;					\
+	lwz	r9,GPR9(r1);						\
+	lwz	r12,GPR12(r1);						\
+	lwz	r10,GPR10(r1);						\
+	lwz	r11,GPR11(r1);						\
+	lwz	r1,GPR1(r1);						\
+	PPC405_ERR77_SYNC;						\
+	exc_lvl_rfi;							\
+	b	.;		/* prevent prefetch past exc_lvl_rfi */
+
+#define	RESTORE_xSRR(exc_lvl_srr0, exc_lvl_srr1)			\
+	lwz	r9,_##exc_lvl_srr0(r1);					\
+	lwz	r10,_##exc_lvl_srr1(r1);				\
+	mtspr	SPRN_##exc_lvl_srr0,r9;					\
+	mtspr	SPRN_##exc_lvl_srr1,r10;
+
+#if defined(CONFIG_PPC_BOOK3E_MMU)
+#ifdef CONFIG_PHYS_64BIT
+#define	RESTORE_MAS7							\
+	lwz	r11,MAS7(r1);						\
+	mtspr	SPRN_MAS7,r11;
+#else
+#define	RESTORE_MAS7
+#endif /* CONFIG_PHYS_64BIT */
+#define RESTORE_MMU_REGS						\
+	lwz	r9,MAS0(r1);						\
+	lwz	r10,MAS1(r1);						\
+	lwz	r11,MAS2(r1);						\
+	mtspr	SPRN_MAS0,r9;						\
+	lwz	r9,MAS3(r1);						\
+	mtspr	SPRN_MAS1,r10;						\
+	lwz	r10,MAS6(r1);						\
+	mtspr	SPRN_MAS2,r11;						\
+	mtspr	SPRN_MAS3,r9;						\
+	mtspr	SPRN_MAS6,r10;						\
+	RESTORE_MAS7;
+#elif defined(CONFIG_44x)
+#define RESTORE_MMU_REGS						\
+	lwz	r9,MMUCR(r1);						\
+	mtspr	SPRN_MMUCR,r9;
+#else
+#define RESTORE_MMU_REGS
+#endif
+
+#ifdef CONFIG_40x
+	.globl	ret_from_crit_exc
+ret_from_crit_exc:
+	mfspr	r9,SPRN_SPRG_THREAD
+	lis	r10,saved_ksp_limit@ha;
+	lwz	r10,saved_ksp_limit@l(r10);
+	tovirt(r9,r9);
+	stw	r10,KSP_LIMIT(r9)
+	lis	r9,crit_srr0@ha;
+	lwz	r9,crit_srr0@l(r9);
+	lis	r10,crit_srr1@ha;
+	lwz	r10,crit_srr1@l(r10);
+	mtspr	SPRN_SRR0,r9;
+	mtspr	SPRN_SRR1,r10;
+	RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI)
+#endif /* CONFIG_40x */
+
+#ifdef CONFIG_BOOKE
+	.globl	ret_from_crit_exc
+ret_from_crit_exc:
+	mfspr	r9,SPRN_SPRG_THREAD
+	lwz	r10,SAVED_KSP_LIMIT(r1)
+	stw	r10,KSP_LIMIT(r9)
+	RESTORE_xSRR(SRR0,SRR1);
+	RESTORE_MMU_REGS;
+	RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI)
+
+	.globl	ret_from_debug_exc
+ret_from_debug_exc:
+	mfspr	r9,SPRN_SPRG_THREAD
+	lwz	r10,SAVED_KSP_LIMIT(r1)
+	stw	r10,KSP_LIMIT(r9)
+	lwz	r9,THREAD_INFO-THREAD(r9)
+	CURRENT_THREAD_INFO(r10, r1)
+	lwz	r10,TI_PREEMPT(r10)
+	stw	r10,TI_PREEMPT(r9)
+	RESTORE_xSRR(SRR0,SRR1);
+	RESTORE_xSRR(CSRR0,CSRR1);
+	RESTORE_MMU_REGS;
+	RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI)
+
+	.globl	ret_from_mcheck_exc
+ret_from_mcheck_exc:
+	mfspr	r9,SPRN_SPRG_THREAD
+	lwz	r10,SAVED_KSP_LIMIT(r1)
+	stw	r10,KSP_LIMIT(r9)
+	RESTORE_xSRR(SRR0,SRR1);
+	RESTORE_xSRR(CSRR0,CSRR1);
+	RESTORE_xSRR(DSRR0,DSRR1);
+	RESTORE_MMU_REGS;
+	RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI)
+#endif /* CONFIG_BOOKE */
+
+/*
+ * Load the DBCR0 value for a task that is being ptraced,
+ * having first saved away the global DBCR0.  Note that r0
+ * has the dbcr0 value to set upon entry to this.
+ */
+load_dbcr0:
+	mfmsr	r10		/* first disable debug exceptions */
+	rlwinm	r10,r10,0,~MSR_DE
+	mtmsr	r10
+	isync
+	mfspr	r10,SPRN_DBCR0
+	lis	r11,global_dbcr0@ha
+	addi	r11,r11,global_dbcr0@l
+#ifdef CONFIG_SMP
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r9,TI_CPU(r9)
+	slwi	r9,r9,3
+	add	r11,r11,r9
+#endif
+	stw	r10,0(r11)
+	mtspr	SPRN_DBCR0,r0
+	lwz	r10,4(r11)
+	addi	r10,r10,1
+	stw	r10,4(r11)
+	li	r11,-1
+	mtspr	SPRN_DBSR,r11	/* clear all pending debug events */
+	blr
+
+	.section .bss
+	.align	4
+global_dbcr0:
+	.space	8*NR_CPUS
+	.previous
+#endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
+
+do_work:			/* r10 contains MSR_KERNEL here */
+	andi.	r0,r9,_TIF_NEED_RESCHED
+	beq	do_user_signal
+
+do_resched:			/* r10 contains MSR_KERNEL here */
+	/* Note: We don't need to inform lockdep that we are enabling
+	 * interrupts here. As far as it knows, they are already enabled
+	 */
+	ori	r10,r10,MSR_EE
+	SYNC
+	MTMSRD(r10)		/* hard-enable interrupts */
+	bl	schedule
+recheck:
+	/* Note: And we don't tell it we are disabling them again
+	 * neither. Those disable/enable cycles used to peek at
+	 * TI_FLAGS aren't advertised.
+	 */
+	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
+	SYNC
+	MTMSRD(r10)		/* disable interrupts */
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r9,TI_FLAGS(r9)
+	andi.	r0,r9,_TIF_NEED_RESCHED
+	bne-	do_resched
+	andi.	r0,r9,_TIF_USER_WORK_MASK
+	beq	restore_user
+do_user_signal:			/* r10 contains MSR_KERNEL here */
+	ori	r10,r10,MSR_EE
+	SYNC
+	MTMSRD(r10)		/* hard-enable interrupts */
+	/* save r13-r31 in the exception frame, if not already done */
+	lwz	r3,_TRAP(r1)
+	andi.	r0,r3,1
+	beq	2f
+	SAVE_NVGPRS(r1)
+	rlwinm	r3,r3,0,0,30
+	stw	r3,_TRAP(r1)
+2:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	mr	r4,r9
+	bl	do_notify_resume
+	REST_NVGPRS(r1)
+	b	recheck
+
+/*
+ * We come here when we are at the end of handling an exception
+ * that occurred at a place where taking an exception will lose
+ * state information, such as the contents of SRR0 and SRR1.
+ */
+nonrecoverable:
+	lis	r10,exc_exit_restart_end@ha
+	addi	r10,r10,exc_exit_restart_end@l
+	cmplw	r12,r10
+	bge	3f
+	lis	r11,exc_exit_restart@ha
+	addi	r11,r11,exc_exit_restart@l
+	cmplw	r12,r11
+	blt	3f
+	lis	r10,ee_restarts@ha
+	lwz	r12,ee_restarts@l(r10)
+	addi	r12,r12,1
+	stw	r12,ee_restarts@l(r10)
+	mr	r12,r11		/* restart at exc_exit_restart */
+	blr
+3:	/* OK, we can't recover, kill this process */
+	/* but the 601 doesn't implement the RI bit, so assume it's OK */
+BEGIN_FTR_SECTION
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_601)
+	lwz	r3,_TRAP(r1)
+	andi.	r0,r3,1
+	beq	4f
+	SAVE_NVGPRS(r1)
+	rlwinm	r3,r3,0,0,30
+	stw	r3,_TRAP(r1)
+4:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	nonrecoverable_exception
+	/* shouldn't return */
+	b	4b
+
+	.section .bss
+	.align	2
+ee_restarts:
+	.space	4
+	.previous
+
+/*
+ * PROM code for specific machines follows.  Put it
+ * here so it's easy to add arch-specific sections later.
+ * -- Cort
+ */
+#ifdef CONFIG_PPC_RTAS
+/*
+ * On CHRP, the Run-Time Abstraction Services (RTAS) have to be
+ * called with the MMU off.
+ */
+_GLOBAL(enter_rtas)
+	stwu	r1,-INT_FRAME_SIZE(r1)
+	mflr	r0
+	stw	r0,INT_FRAME_SIZE+4(r1)
+	LOAD_REG_ADDR(r4, rtas)
+	lis	r6,1f@ha	/* physical return address for rtas */
+	addi	r6,r6,1f@l
+	tophys(r6,r6)
+	tophys(r7,r1)
+	lwz	r8,RTASENTRY(r4)
+	lwz	r4,RTASBASE(r4)
+	mfmsr	r9
+	stw	r9,8(r1)
+	LOAD_MSR_KERNEL(r0,MSR_KERNEL)
+	SYNC			/* disable interrupts so SRR0/1 */
+	MTMSRD(r0)		/* don't get trashed */
+	li	r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
+	mtlr	r6
+	mtspr	SPRN_SPRG_RTAS,r7
+	mtspr	SPRN_SRR0,r8
+	mtspr	SPRN_SRR1,r9
+	RFI
+1:	tophys(r9,r1)
+	lwz	r8,INT_FRAME_SIZE+4(r9)	/* get return address */
+	lwz	r9,8(r9)	/* original msr value */
+	addi	r1,r1,INT_FRAME_SIZE
+	li	r0,0
+	mtspr	SPRN_SPRG_RTAS,r0
+	mtspr	SPRN_SRR0,r8
+	mtspr	SPRN_SRR1,r9
+	RFI			/* return to caller */
+
+	.globl	machine_check_in_rtas
+machine_check_in_rtas:
+	twi	31,0,0
+	/* XXX load up BATs and panic */
+
+#endif /* CONFIG_PPC_RTAS */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
new file mode 100644
index 000000000..58b50967b
--- /dev/null
+++ b/arch/powerpc/kernel/entry_64.S
@@ -0,0 +1,1343 @@
+/*
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *  MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This file contains the system call entry code, context switch
+ *  code, and exception/interrupt return code for PowerPC.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <asm/unistd.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/thread_info.h>
+#include <asm/code-patching-asm.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+#include <asm/irqflags.h>
+#include <asm/hw_irq.h>
+#include <asm/context_tracking.h>
+#include <asm/tm.h>
+#include <asm/ppc-opcode.h>
+#include <asm/barrier.h>
+#include <asm/export.h>
+#include <asm/asm-compat.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/exception-64s.h>
+#else
+#include <asm/exception-64e.h>
+#endif
+#include <asm/feature-fixups.h>
+
+/*
+ * System calls.
+ */
+	.section	".toc","aw"
+SYS_CALL_TABLE:
+	.tc sys_call_table[TC],sys_call_table
+
+/* This value is used to mark exception frames on the stack. */
+exception_marker:
+	.tc	ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER
+
+	.section	".text"
+	.align 7
+
+	.globl system_call_common
+system_call_common:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	extrdi.	r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
+	bne	.Ltabort_syscall
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+	andi.	r10,r12,MSR_PR
+	mr	r10,r1
+	addi	r1,r1,-INT_FRAME_SIZE
+	beq-	1f
+	ld	r1,PACAKSAVE(r13)
+1:	std	r10,0(r1)
+	std	r11,_NIP(r1)
+	std	r12,_MSR(r1)
+	std	r0,GPR0(r1)
+	std	r10,GPR1(r1)
+	beq	2f			/* if from kernel mode */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+START_BTB_FLUSH_SECTION
+	BTB_FLUSH(r10)
+END_BTB_FLUSH_SECTION
+#endif
+	ACCOUNT_CPU_USER_ENTRY(r13, r10, r11)
+2:	std	r2,GPR2(r1)
+	std	r3,GPR3(r1)
+	mfcr	r2
+	std	r4,GPR4(r1)
+	std	r5,GPR5(r1)
+	std	r6,GPR6(r1)
+	std	r7,GPR7(r1)
+	std	r8,GPR8(r1)
+	li	r11,0
+	std	r11,GPR9(r1)
+	std	r11,GPR10(r1)
+	std	r11,GPR11(r1)
+	std	r11,GPR12(r1)
+	std	r11,_XER(r1)
+	std	r11,_CTR(r1)
+	std	r9,GPR13(r1)
+	mflr	r10
+	/*
+	 * This clears CR0.SO (bit 28), which is the error indication on
+	 * return from this system call.
+	 */
+	rldimi	r2,r11,28,(63-28)
+	li	r11,0xc01
+	std	r10,_LINK(r1)
+	std	r11,_TRAP(r1)
+	std	r3,ORIG_GPR3(r1)
+	std	r2,_CCR(r1)
+	ld	r2,PACATOC(r13)
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+	ld	r11,exception_marker@toc(r2)
+	std	r11,-16(r9)		/* "regshere" marker */
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
+BEGIN_FW_FTR_SECTION
+	beq	33f
+	/* if from user, see if there are any DTL entries to process */
+	ld	r10,PACALPPACAPTR(r13)	/* get ptr to VPA */
+	ld	r11,PACA_DTL_RIDX(r13)	/* get log read index */
+	addi	r10,r10,LPPACA_DTLIDX
+	LDX_BE	r10,0,r10		/* get log write index */
+	cmpd	cr1,r11,r10
+	beq+	cr1,33f
+	bl	accumulate_stolen_time
+	REST_GPR(0,r1)
+	REST_4GPRS(3,r1)
+	REST_2GPRS(7,r1)
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+33:
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */
+
+	/*
+	 * A syscall should always be called with interrupts enabled
+	 * so we just unconditionally hard-enable here. When some kind
+	 * of irq tracing is used, we additionally check that condition
+	 * is correct
+	 */
+#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG)
+	lbz	r10,PACAIRQSOFTMASK(r13)
+1:	tdnei	r10,IRQS_ENABLED
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+#endif
+
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	1
+#else
+	li	r11,MSR_RI
+	ori	r11,r11,MSR_EE
+	mtmsrd	r11,1
+#endif /* CONFIG_PPC_BOOK3E */
+
+system_call:			/* label this so stack traces look sane */
+	/* We do need to set SOFTE in the stack frame or the return
+	 * from interrupt will be painful
+	 */
+	li	r10,IRQS_ENABLED
+	std	r10,SOFTE(r1)
+
+	CURRENT_THREAD_INFO(r11, r1)
+	ld	r10,TI_FLAGS(r11)
+	andi.	r11,r10,_TIF_SYSCALL_DOTRACE
+	bne	.Lsyscall_dotrace		/* does not return */
+	cmpldi	0,r0,NR_syscalls
+	bge-	.Lsyscall_enosys
+
+.Lsyscall:
+/*
+ * Need to vector to 32 Bit or default sys_call_table here,
+ * based on caller's run-mode / personality.
+ */
+	ld	r11,SYS_CALL_TABLE@toc(2)
+	andi.	r10,r10,_TIF_32BIT
+	beq	15f
+	addi	r11,r11,8	/* use 32-bit syscall entries */
+	clrldi	r3,r3,32
+	clrldi	r4,r4,32
+	clrldi	r5,r5,32
+	clrldi	r6,r6,32
+	clrldi	r7,r7,32
+	clrldi	r8,r8,32
+15:
+	slwi	r0,r0,4
+
+	barrier_nospec_asm
+	/*
+	 * Prevent the load of the handler below (based on the user-passed
+	 * system call number) being speculatively executed until the test
+	 * against NR_syscalls and branch to .Lsyscall_enosys above has
+	 * committed.
+	 */
+
+	ldx	r12,r11,r0	/* Fetch system call handler [ptr] */
+	mtctr   r12
+	bctrl			/* Call handler */
+
+.Lsyscall_exit:
+	std	r3,RESULT(r1)
+
+#ifdef CONFIG_DEBUG_RSEQ
+	/* Check whether the syscall is issued inside a restartable sequence */
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      rseq_syscall
+	ld	r3,RESULT(r1)
+#endif
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	ld	r8,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3S
+	/* No MSR:RI on BookE */
+	andi.	r10,r8,MSR_RI
+	beq-	.Lunrecov_restore
+#endif
+
+/*
+ * This is a few instructions into the actual syscall exit path (which actually
+ * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the
+ * number of visible symbols for profiling purposes.
+ *
+ * We can probe from system_call until this point as MSR_RI is set. But once it
+ * is cleared below, we won't be able to take a trap.
+ *
+ * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL().
+ */
+system_call_exit:
+	/*
+	 * Disable interrupts so current_thread_info()->flags can't change,
+	 * and so that we don't get interrupted after loading SRR0/1.
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	0
+#else
+	/*
+	 * For performance reasons we clear RI the same time that we
+	 * clear EE. We only need to clear RI just before we restore r13
+	 * below, but batching it with EE saves us one expensive mtmsrd call.
+	 * We have to be careful to restore RI if we branch anywhere from
+	 * here (eg syscall_exit_work).
+	 */
+	li	r11,0
+	mtmsrd	r11,1
+#endif /* CONFIG_PPC_BOOK3E */
+
+	ld	r9,TI_FLAGS(r12)
+	li	r11,-MAX_ERRNO
+	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
+	bne-	.Lsyscall_exit_work
+
+	andi.	r0,r8,MSR_FP
+	beq 2f
+#ifdef CONFIG_ALTIVEC
+	andis.	r0,r8,MSR_VEC@h
+	bne	3f
+#endif
+2:	addi    r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
+	mtmsrd	r10,1		/* Restore RI */
+#endif
+	bl	restore_math
+#ifdef CONFIG_PPC_BOOK3S
+	li	r11,0
+	mtmsrd	r11,1
+#endif
+	ld	r8,_MSR(r1)
+	ld	r3,RESULT(r1)
+	li	r11,-MAX_ERRNO
+
+3:	cmpld	r3,r11
+	ld	r5,_CCR(r1)
+	bge-	.Lsyscall_error
+.Lsyscall_error_cont:
+	ld	r7,_NIP(r1)
+BEGIN_FTR_SECTION
+	stdcx.	r0,0,r1			/* to clear the reservation */
+END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+	andi.	r6,r8,MSR_PR
+	ld	r4,_LINK(r1)
+
+	beq-	1f
+	ACCOUNT_CPU_USER_EXIT(r13, r11, r12)
+
+BEGIN_FTR_SECTION
+	HMT_MEDIUM_LOW
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
+	ld	r2,GPR2(r1)
+	ld	r1,GPR1(r1)
+	mtlr	r4
+	mtcr	r5
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r8
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+
+	/* exit to kernel */
+1:	ld	r2,GPR2(r1)
+	ld	r1,GPR1(r1)
+	mtlr	r4
+	mtcr	r5
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r8
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+
+.Lsyscall_error:
+	oris	r5,r5,0x1000	/* Set SO bit in CR */
+	neg	r3,r3
+	std	r5,_CCR(r1)
+	b	.Lsyscall_error_cont
+
+/* Traced system call support */
+.Lsyscall_dotrace:
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_enter
+
+	/*
+	 * We use the return value of do_syscall_trace_enter() as the syscall
+	 * number. If the syscall was rejected for any reason do_syscall_trace_enter()
+	 * returns an invalid syscall number and the test below against
+	 * NR_syscalls will fail.
+	 */
+	mr	r0,r3
+
+	/* Restore argument registers just clobbered and/or possibly changed. */
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r5,GPR5(r1)
+	ld	r6,GPR6(r1)
+	ld	r7,GPR7(r1)
+	ld	r8,GPR8(r1)
+
+	/* Repopulate r9 and r10 for the syscall path */
+	addi	r9,r1,STACK_FRAME_OVERHEAD
+	CURRENT_THREAD_INFO(r10, r1)
+	ld	r10,TI_FLAGS(r10)
+
+	cmpldi	r0,NR_syscalls
+	blt+	.Lsyscall
+
+	/* Return code is already in r3 thanks to do_syscall_trace_enter() */
+	b	.Lsyscall_exit
+
+
+.Lsyscall_enosys:
+	li	r3,-ENOSYS
+	b	.Lsyscall_exit
+	
+.Lsyscall_exit_work:
+#ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
+	mtmsrd	r10,1		/* Restore RI */
+#endif
+	/* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
+	 If TIF_NOERROR is set, just save r3 as it is. */
+
+	andi.	r0,r9,_TIF_RESTOREALL
+	beq+	0f
+	REST_NVGPRS(r1)
+	b	2f
+0:	cmpld	r3,r11		/* r11 is -MAX_ERRNO */
+	blt+	1f
+	andi.	r0,r9,_TIF_NOERROR
+	bne-	1f
+	ld	r5,_CCR(r1)
+	neg	r3,r3
+	oris	r5,r5,0x1000	/* Set SO bit in CR */
+	std	r5,_CCR(r1)
+1:	std	r3,GPR3(r1)
+2:	andi.	r0,r9,(_TIF_PERSYSCALL_MASK)
+	beq	4f
+
+	/* Clear per-syscall TIF flags if any are set.  */
+
+	li	r11,_TIF_PERSYSCALL_MASK
+	addi	r12,r12,TI_FLAGS
+3:	ldarx	r10,0,r12
+	andc	r10,r10,r11
+	stdcx.	r10,0,r12
+	bne-	3b
+	subi	r12,r12,TI_FLAGS
+
+4:	/* Anything else left to do? */
+BEGIN_FTR_SECTION
+	lis	r3,INIT_PPR@highest	/* Set thread.ppr = 3 */
+	ld	r10,PACACURRENT(r13)
+	sldi	r3,r3,32	/* bits 11-13 are used for ppr */
+	std	r3,TASKTHREADPPR(r10)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
+	beq	ret_from_except_lite
+
+	/* Re-enable interrupts */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	1
+#else
+	li	r10,MSR_RI
+	ori	r10,r10,MSR_EE
+	mtmsrd	r10,1
+#endif /* CONFIG_PPC_BOOK3E */
+
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_syscall_trace_leave
+	b	ret_from_except
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+.Ltabort_syscall:
+	/* Firstly we need to enable TM in the kernel */
+	mfmsr	r10
+	li	r9, 1
+	rldimi	r10, r9, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r10, 0
+
+	/* tabort, this dooms the transaction, nothing else */
+	li	r9, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)
+	TABORT(R9)
+
+	/*
+	 * Return directly to userspace. We have corrupted user register state,
+	 * but userspace will never see that register state. Execution will
+	 * resume after the tbegin of the aborted transaction with the
+	 * checkpointed register state.
+	 */
+	li	r9, MSR_RI
+	andc	r10, r10, r9
+	mtmsrd	r10, 1
+	mtspr	SPRN_SRR0, r11
+	mtspr	SPRN_SRR1, r12
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+#endif
+_ASM_NOKPROBE_SYMBOL(system_call_common);
+_ASM_NOKPROBE_SYMBOL(system_call_exit);
+
+/* Save non-volatile GPRs, if not already saved. */
+_GLOBAL(save_nvgprs)
+	ld	r11,_TRAP(r1)
+	andi.	r0,r11,1
+	beqlr-
+	SAVE_NVGPRS(r1)
+	clrrdi	r0,r11,1
+	std	r0,_TRAP(r1)
+	blr
+_ASM_NOKPROBE_SYMBOL(save_nvgprs);
+
+	
+/*
+ * The sigsuspend and rt_sigsuspend system calls can call do_signal
+ * and thus put the process into the stopped state where we might
+ * want to examine its user state with ptrace.  Therefore we need
+ * to save all the nonvolatile registers (r14 - r31) before calling
+ * the C code.  Similarly, fork, vfork and clone need the full
+ * register state on the stack so that it can be copied to the child.
+ */
+
+_GLOBAL(ppc_fork)
+	bl	save_nvgprs
+	bl	sys_fork
+	b	.Lsyscall_exit
+
+_GLOBAL(ppc_vfork)
+	bl	save_nvgprs
+	bl	sys_vfork
+	b	.Lsyscall_exit
+
+_GLOBAL(ppc_clone)
+	bl	save_nvgprs
+	bl	sys_clone
+	b	.Lsyscall_exit
+
+_GLOBAL(ppc32_swapcontext)
+	bl	save_nvgprs
+	bl	compat_sys_swapcontext
+	b	.Lsyscall_exit
+
+_GLOBAL(ppc64_swapcontext)
+	bl	save_nvgprs
+	bl	sys_swapcontext
+	b	.Lsyscall_exit
+
+_GLOBAL(ppc_switch_endian)
+	bl	save_nvgprs
+	bl	sys_switch_endian
+	b	.Lsyscall_exit
+
+_GLOBAL(ret_from_fork)
+	bl	schedule_tail
+	REST_NVGPRS(r1)
+	li	r3,0
+	b	.Lsyscall_exit
+
+_GLOBAL(ret_from_kernel_thread)
+	bl	schedule_tail
+	REST_NVGPRS(r1)
+	mtlr	r14
+	mr	r3,r15
+#ifdef PPC64_ELF_ABI_v2
+	mr	r12,r14
+#endif
+	blrl
+	li	r3,0
+	b	.Lsyscall_exit
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define FLUSH_COUNT_CACHE	\
+1:	nop;			\
+	patch_site 1b, patch__call_flush_count_cache
+
+
+#define BCCTR_FLUSH	.long 0x4c400420
+
+.macro nops number
+	.rept \number
+	nop
+	.endr
+.endm
+
+.balign 32
+.global flush_count_cache
+flush_count_cache:
+	/* Save LR into r9 */
+	mflr	r9
+
+	// Flush the link stack
+	.rept 64
+	bl	.+4
+	.endr
+	b	1f
+	nops	6
+
+	.balign 32
+	/* Restore LR */
+1:	mtlr	r9
+
+	// If we're just flushing the link stack, return here
+3:	nop
+	patch_site 3b patch__flush_link_stack_return
+
+	li	r9,0x7fff
+	mtctr	r9
+
+	BCCTR_FLUSH
+
+2:	nop
+	patch_site 2b patch__flush_count_cache_return
+
+	nops	3
+
+	.rept 278
+	.balign 32
+	BCCTR_FLUSH
+	nops	7
+	.endr
+
+	blr
+#else
+#define FLUSH_COUNT_CACHE
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+/*
+ * This routine switches between two different tasks.  The process
+ * state of one is saved on its kernel stack.  Then the state
+ * of the other is restored from its kernel stack.  The memory
+ * management hardware is updated to the second process's state.
+ * Finally, we can return to the second process, via ret_from_except.
+ * On entry, r3 points to the THREAD for the current task, r4
+ * points to the THREAD for the new task.
+ *
+ * Note: there are two ways to get to the "going out" portion
+ * of this code; either by coming in via the entry (_switch)
+ * or via "fork" which must set up an environment equivalent
+ * to the "_switch" path.  If you change this you'll have to change
+ * the fork code also.
+ *
+ * The code which creates the new task context is in 'copy_thread'
+ * in arch/powerpc/kernel/process.c 
+ */
+	.align	7
+_GLOBAL(_switch)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+	/* r3-r13 are caller saved -- Cort */
+	SAVE_8GPRS(14, r1)
+	SAVE_10GPRS(22, r1)
+	std	r0,_NIP(r1)	/* Return to switch caller */
+	mfcr	r23
+	std	r23,_CCR(r1)
+	std	r1,KSP(r3)	/* Set old stack pointer */
+
+	FLUSH_COUNT_CACHE
+
+	/*
+	 * On SMP kernels, care must be taken because a task may be
+	 * scheduled off CPUx and on to CPUy. Memory ordering must be
+	 * considered.
+	 *
+	 * Cacheable stores on CPUx will be visible when the task is
+	 * scheduled on CPUy by virtue of the core scheduler barriers
+	 * (see "Notes on Program-Order guarantees on SMP systems." in
+	 * kernel/sched/core.c).
+	 *
+	 * Uncacheable stores in the case of involuntary preemption must
+	 * be taken care of. The smp_mb__before_spin_lock() in __schedule()
+	 * is implemented as hwsync on powerpc, which orders MMIO too. So
+	 * long as there is an hwsync in the context switch path, it will
+	 * be executed on the source CPU after the task has performed
+	 * all MMIO ops on that CPU, and on the destination CPU before the
+	 * task performs any MMIO ops there.
+	 */
+
+	/*
+	 * The kernel context switch path must contain a spin_lock,
+	 * which contains larx/stcx, which will clear any reservation
+	 * of the task being switched.
+	 */
+#ifdef CONFIG_PPC_BOOK3S
+/* Cancel all explict user streams as they will have no use after context
+ * switch and will stop the HW from creating streams itself
+ */
+	DCBT_BOOK3S_STOP_ALL_STREAM_IDS(r6)
+#endif
+
+	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
+	std	r6,PACACURRENT(r13)	/* Set new 'current' */
+
+	ld	r8,KSP(r4)	/* new stack pointer */
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+BEGIN_FTR_SECTION
+	clrrdi	r6,r8,28	/* get its ESID */
+	clrrdi	r9,r1,28	/* get current sp ESID */
+FTR_SECTION_ELSE
+	clrrdi	r6,r8,40	/* get its 1T ESID */
+	clrrdi	r9,r1,40	/* get current sp 1T ESID */
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_1T_SEGMENT)
+	clrldi.	r0,r6,2		/* is new ESID c00000000? */
+	cmpd	cr1,r6,r9	/* or is new ESID the same as current ESID? */
+	cror	eq,4*cr1+eq,eq
+	beq	2f		/* if yes, don't slbie it */
+
+	/* Bolt in the new stack SLB entry */
+	ld	r7,KSP_VSID(r4)	/* Get new stack's VSID */
+	oris	r0,r6,(SLB_ESID_V)@h
+	ori	r0,r0,(SLB_NUM_BOLTED-1)@l
+BEGIN_FTR_SECTION
+	li	r9,MMU_SEGSIZE_1T	/* insert B field */
+	oris	r6,r6,(MMU_SEGSIZE_1T << SLBIE_SSIZE_SHIFT)@h
+	rldimi	r7,r9,SLB_VSID_SSIZE_SHIFT,0
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
+
+	/* Update the last bolted SLB.  No write barriers are needed
+	 * here, provided we only update the current CPU's SLB shadow
+	 * buffer.
+	 */
+	ld	r9,PACA_SLBSHADOWPTR(r13)
+	li	r12,0
+	std	r12,SLBSHADOW_STACKESID(r9)	/* Clear ESID */
+	li	r12,SLBSHADOW_STACKVSID
+	STDX_BE	r7,r12,r9			/* Save VSID */
+	li	r12,SLBSHADOW_STACKESID
+	STDX_BE	r0,r12,r9			/* Save ESID */
+
+	/* No need to check for MMU_FTR_NO_SLBIE_B here, since when
+	 * we have 1TB segments, the only CPUs known to have the errata
+	 * only support less than 1TB of system memory and we'll never
+	 * actually hit this code path.
+	 */
+
+	isync
+	slbie	r6
+	slbie	r6		/* Workaround POWER5 < DD2.1 issue */
+	slbmte	r7,r0
+	isync
+2:
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	CURRENT_THREAD_INFO(r7, r8)  /* base of new stack */
+	/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
+	   because we don't need to leave the 288-byte ABI gap at the
+	   top of the kernel stack. */
+	addi	r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE
+
+	/*
+	 * PMU interrupts in radix may come in here. They will use r1, not
+	 * PACAKSAVE, so this stack switch will not cause a problem. They
+	 * will store to the process stack, which may then be migrated to
+	 * another CPU. However the rq lock release on this CPU paired with
+	 * the rq lock acquire on the new CPU before the stack becomes
+	 * active on the new CPU, will order those stores.
+	 */
+	mr	r1,r8		/* start using new stack pointer */
+	std	r7,PACAKSAVE(r13)
+
+	ld	r6,_CCR(r1)
+	mtcrf	0xFF,r6
+
+	/* r3-r13 are destroyed -- Cort */
+	REST_8GPRS(14, r1)
+	REST_10GPRS(22, r1)
+
+	/* convert old thread to its task_struct for return value */
+	addi	r3,r3,-THREAD
+	ld	r7,_NIP(r1)	/* Return to _switch caller in new task */
+	mtlr	r7
+	addi	r1,r1,SWITCH_FRAME_SIZE
+	blr
+
+	.align	7
+_GLOBAL(ret_from_except)
+	ld	r11,_TRAP(r1)
+	andi.	r0,r11,1
+	bne	ret_from_except_lite
+	REST_NVGPRS(r1)
+
+_GLOBAL(ret_from_except_lite)
+	/*
+	 * Disable interrupts so that current_thread_info()->flags
+	 * can't change between when we test it and when we return
+	 * from the interrupt.
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	0
+#else
+	li	r10,MSR_RI
+	mtmsrd	r10,1		  /* Update machine state */
+#endif /* CONFIG_PPC_BOOK3E */
+
+	CURRENT_THREAD_INFO(r9, r1)
+	ld	r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+	ld	r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
+	ld	r4,TI_FLAGS(r9)
+	andi.	r3,r3,MSR_PR
+	beq	resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+	lwz	r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
+
+	/* Check current_thread_info()->flags */
+	andi.	r0,r4,_TIF_USER_WORK_MASK
+	bne	1f
+#ifdef CONFIG_PPC_BOOK3E
+	/*
+	 * Check to see if the dbcr0 register is set up to debug.
+	 * Use the internal debug mode bit to do this.
+	 */
+	andis.	r0,r3,DBCR0_IDM@h
+	beq	restore
+	mfmsr	r0
+	rlwinm	r0,r0,0,~MSR_DE	/* Clear MSR.DE */
+	mtmsr	r0
+	mtspr	SPRN_DBCR0,r3
+	li	r10, -1
+	mtspr	SPRN_DBSR,r10
+	b	restore
+#else
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_math
+	b	restore
+#endif
+1:	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	2f
+	bl	restore_interrupts
+	SCHEDULE_USER
+	b	ret_from_except_lite
+2:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	andi.	r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+	bne	3f		/* only restore TM if nothing else to do */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_tm_state
+	b	restore
+3:
+#endif
+	bl	save_nvgprs
+	/*
+	 * Use a non volatile GPR to save and restore our thread_info flags
+	 * across the call to restore_interrupts.
+	 */
+	mr	r30,r4
+	bl	restore_interrupts
+	mr	r4,r30
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_notify_resume
+	b	ret_from_except
+
+resume_kernel:
+	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+	andis.	r8,r4,_TIF_EMULATE_STACK_STORE@h
+	beq+	1f
+
+	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
+
+	ld	r3,GPR1(r1)
+	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
+	mr	r4,r1			/* src:  current exception frame */
+	mr	r1,r3			/* Reroute the trampoline frame to r1 */
+
+	/* Copy from the original to the trampoline. */
+	li	r5,INT_FRAME_SIZE/8	/* size: INT_FRAME_SIZE */
+	li	r6,0			/* start offset: 0 */
+	mtctr	r5
+2:	ldx	r0,r6,r4
+	stdx	r0,r6,r3
+	addi	r6,r6,8
+	bdnz	2b
+
+	/* Do real store operation to complete stdu */
+	ld	r5,GPR1(r1)
+	std	r8,0(r5)
+
+	/* Clear _TIF_EMULATE_STACK_STORE flag */
+	lis	r11,_TIF_EMULATE_STACK_STORE@h
+	addi	r5,r9,TI_FLAGS
+0:	ldarx	r4,0,r5
+	andc	r4,r4,r11
+	stdcx.	r4,0,r5
+	bne-	0b
+1:
+
+#ifdef CONFIG_PREEMPT
+	/* Check if we need to preempt */
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq+	restore
+	/* Check that preempt_count() == 0 and interrupts are enabled */
+	lwz	r8,TI_PREEMPT(r9)
+	cmpwi	cr0,r8,0
+	bne	restore
+	ld	r0,SOFTE(r1)
+	andi.	r0,r0,IRQS_DISABLED
+	bne	restore
+
+	/*
+	 * Here we are preempting the current task. We want to make
+	 * sure we are soft-disabled first and reconcile irq state.
+	 */
+	RECONCILE_IRQ_STATE(r3,r4)
+1:	bl	preempt_schedule_irq
+
+	/* Re-test flags and eventually loop */
+	CURRENT_THREAD_INFO(r9, r1)
+	ld	r4,TI_FLAGS(r9)
+	andi.	r0,r4,_TIF_NEED_RESCHED
+	bne	1b
+
+	/*
+	 * arch_local_irq_restore() from preempt_schedule_irq above may
+	 * enable hard interrupt but we really should disable interrupts
+	 * when we return from the interrupt, and so that we don't get
+	 * interrupted after loading SRR0/1.
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	0
+#else
+	li	r10,MSR_RI
+	mtmsrd	r10,1		  /* Update machine state */
+#endif /* CONFIG_PPC_BOOK3E */
+#endif /* CONFIG_PREEMPT */
+
+	.globl	fast_exc_return_irq
+fast_exc_return_irq:
+restore:
+	/*
+	 * This is the main kernel exit path. First we check if we
+	 * are about to re-enable interrupts
+	 */
+	ld	r5,SOFTE(r1)
+	lbz	r6,PACAIRQSOFTMASK(r13)
+	andi.	r5,r5,IRQS_DISABLED
+	bne	.Lrestore_irq_off
+
+	/* We are enabling, were we already enabled ? Yes, just return */
+	andi.	r6,r6,IRQS_DISABLED
+	beq	cr0,.Ldo_restore
+
+	/*
+	 * We are about to soft-enable interrupts (we are hard disabled
+	 * at this point). We check if there's anything that needs to
+	 * be replayed first.
+	 */
+	lbz	r0,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r0,0
+	bne-	.Lrestore_check_irq_replay
+
+	/*
+	 * Get here when nothing happened while soft-disabled, just
+	 * soft-enable and move-on. We will hard-enable as a side
+	 * effect of rfi
+	 */
+.Lrestore_no_replay:
+	TRACE_ENABLE_INTS
+	li	r0,IRQS_ENABLED
+	stb	r0,PACAIRQSOFTMASK(r13);
+
+	/*
+	 * Final return path. BookE is handled in a different file
+	 */
+.Ldo_restore:
+#ifdef CONFIG_PPC_BOOK3E
+	b	exception_return_book3e
+#else
+	/*
+	 * Clear the reservation. If we know the CPU tracks the address of
+	 * the reservation then we can potentially save some cycles and use
+	 * a larx. On POWER6 and POWER7 this is significantly faster.
+	 */
+BEGIN_FTR_SECTION
+	stdcx.	r0,0,r1		/* to clear the reservation */
+FTR_SECTION_ELSE
+	ldarx	r4,0,r1
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+
+	/*
+	 * Some code path such as load_up_fpu or altivec return directly
+	 * here. They run entirely hard disabled and do not alter the
+	 * interrupt state. They also don't use lwarx/stwcx. and thus
+	 * are known not to leave dangling reservations.
+	 */
+	.globl	fast_exception_return
+fast_exception_return:
+	ld	r3,_MSR(r1)
+	ld	r4,_CTR(r1)
+	ld	r0,_LINK(r1)
+	mtctr	r4
+	mtlr	r0
+	ld	r4,_XER(r1)
+	mtspr	SPRN_XER,r4
+
+	REST_8GPRS(5, r1)
+
+	andi.	r0,r3,MSR_RI
+	beq-	.Lunrecov_restore
+
+	/* Load PPR from thread struct before we clear MSR:RI */
+BEGIN_FTR_SECTION
+	ld	r2,PACACURRENT(r13)
+	ld	r2,TASKTHREADPPR(r2)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	/*
+	 * Clear RI before restoring r13.  If we are returning to
+	 * userspace and we take an exception after restoring r13,
+	 * we end up corrupting the userspace r13 value.
+	 */
+	li	r4,0
+	mtmsrd	r4,1
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* TM debug */
+	std	r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */
+#endif
+	/*
+	 * r13 is our per cpu area, only restore it if we are returning to
+	 * userspace the value stored in the stack frame may belong to
+	 * another CPU.
+	 */
+	andi.	r0,r3,MSR_PR
+	beq	1f
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PPR,r2	/* Restore PPR */
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
+	REST_GPR(13, r1)
+
+	mtspr	SPRN_SRR1,r3
+
+	ld	r2,_CCR(r1)
+	mtcrf	0xFF,r2
+	ld	r2,_NIP(r1)
+	mtspr	SPRN_SRR0,r2
+
+	ld	r0,GPR0(r1)
+	ld	r2,GPR2(r1)
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r1,GPR1(r1)
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+
+1:	mtspr	SPRN_SRR1,r3
+
+	ld	r2,_CCR(r1)
+	mtcrf	0xFF,r2
+	ld	r2,_NIP(r1)
+	mtspr	SPRN_SRR0,r2
+
+	/*
+	 * Leaving a stale exception_marker on the stack can confuse
+	 * the reliable stack unwinder later on. Clear it.
+	 */
+	li	r2,0
+	std	r2,STACK_FRAME_OVERHEAD-16(r1)
+
+	ld	r0,GPR0(r1)
+	ld	r2,GPR2(r1)
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r1,GPR1(r1)
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+
+#endif /* CONFIG_PPC_BOOK3E */
+
+	/*
+	 * We are returning to a context with interrupts soft disabled.
+	 *
+	 * However, we may also about to hard enable, so we need to
+	 * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
+	 * or that bit can get out of sync and bad things will happen
+	 */
+.Lrestore_irq_off:
+	ld	r3,_MSR(r1)
+	lbz	r7,PACAIRQHAPPENED(r13)
+	andi.	r0,r3,MSR_EE
+	beq	1f
+	rlwinm	r7,r7,0,~PACA_IRQ_HARD_DIS
+	stb	r7,PACAIRQHAPPENED(r13)
+1:
+#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG)
+	/* The interrupt should not have soft enabled. */
+	lbz	r7,PACAIRQSOFTMASK(r13)
+1:	tdeqi	r7,IRQS_ENABLED
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+#endif
+	b	.Ldo_restore
+
+	/*
+	 * Something did happen, check if a re-emit is needed
+	 * (this also clears paca->irq_happened)
+	 */
+.Lrestore_check_irq_replay:
+	/* XXX: We could implement a fast path here where we check
+	 * for irq_happened being just 0x01, in which case we can
+	 * clear it and return. That means that we would potentially
+	 * miss a decrementer having wrapped all the way around.
+	 *
+	 * Still, this might be useful for things like hash_page
+	 */
+	bl	__check_irq_replay
+	cmpwi	cr0,r3,0
+	beq	.Lrestore_no_replay
+ 
+	/*
+	 * We need to re-emit an interrupt. We do so by re-using our
+	 * existing exception frame. We first change the trap value,
+	 * but we need to ensure we preserve the low nibble of it
+	 */
+	ld	r4,_TRAP(r1)
+	clrldi	r4,r4,60
+	or	r4,r4,r3
+	std	r4,_TRAP(r1)
+
+	/*
+	 * PACA_IRQ_HARD_DIS won't always be set here, so set it now
+	 * to reconcile the IRQ state. Tracing is already accounted for.
+	 */
+	lbz	r4,PACAIRQHAPPENED(r13)
+	ori	r4,r4,PACA_IRQ_HARD_DIS
+	stb	r4,PACAIRQHAPPENED(r13)
+
+	/*
+	 * Then find the right handler and call it. Interrupts are
+	 * still soft-disabled and we keep them that way.
+	*/
+	cmpwi	cr0,r3,0x500
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+ 	bl	do_IRQ
+	b	ret_from_except
+1:	cmpwi	cr0,r3,0xf00
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	performance_monitor_exception
+	b	ret_from_except
+1:	cmpwi	cr0,r3,0xe60
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	handle_hmi_exception
+	b	ret_from_except
+1:	cmpwi	cr0,r3,0x900
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	timer_interrupt
+	b	ret_from_except
+#ifdef CONFIG_PPC_DOORBELL
+1:
+#ifdef CONFIG_PPC_BOOK3E
+	cmpwi	cr0,r3,0x280
+#else
+	cmpwi	cr0,r3,0xa00
+#endif /* CONFIG_PPC_BOOK3E */
+	bne	1f
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	bl	doorbell_exception
+#endif /* CONFIG_PPC_DOORBELL */
+1:	b	ret_from_except /* What else to do here ? */
+ 
+.Lunrecov_restore:
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	.Lunrecov_restore
+
+_ASM_NOKPROBE_SYMBOL(ret_from_except);
+_ASM_NOKPROBE_SYMBOL(ret_from_except_lite);
+_ASM_NOKPROBE_SYMBOL(resume_kernel);
+_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq);
+_ASM_NOKPROBE_SYMBOL(restore);
+_ASM_NOKPROBE_SYMBOL(fast_exception_return);
+
+
+#ifdef CONFIG_PPC_RTAS
+/*
+ * On CHRP, the Run-Time Abstraction Services (RTAS) have to be
+ * called with the MMU off.
+ *
+ * In addition, we need to be in 32b mode, at least for now.
+ * 
+ * Note: r3 is an input parameter to rtas, so don't trash it...
+ */
+_GLOBAL(enter_rtas)
+	mflr	r0
+	std	r0,16(r1)
+        stdu	r1,-RTAS_FRAME_SIZE(r1)	/* Save SP and create stack space. */
+
+	/* Because RTAS is running in 32b mode, it clobbers the high order half
+	 * of all registers that it saves.  We therefore save those registers
+	 * RTAS might touch to the stack.  (r0, r3-r13 are caller saved)
+   	 */
+	SAVE_GPR(2, r1)			/* Save the TOC */
+	SAVE_GPR(13, r1)		/* Save paca */
+	SAVE_8GPRS(14, r1)		/* Save the non-volatiles */
+	SAVE_10GPRS(22, r1)		/* ditto */
+
+	mfcr	r4
+	std	r4,_CCR(r1)
+	mfctr	r5
+	std	r5,_CTR(r1)
+	mfspr	r6,SPRN_XER
+	std	r6,_XER(r1)
+	mfdar	r7
+	std	r7,_DAR(r1)
+	mfdsisr	r8
+	std	r8,_DSISR(r1)
+
+	/* Temporary workaround to clear CR until RTAS can be modified to
+	 * ignore all bits.
+	 */
+	li	r0,0
+	mtcr	r0
+
+#ifdef CONFIG_BUG
+	/* There is no way it is acceptable to get here with interrupts enabled,
+	 * check it with the asm equivalent of WARN_ON
+	 */
+	lbz	r0,PACAIRQSOFTMASK(r13)
+1:	tdeqi	r0,IRQS_ENABLED
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+#endif
+
+	/* Hard-disable interrupts */
+	mfmsr	r6
+	rldicl	r7,r6,48,1
+	rotldi	r7,r7,16
+	mtmsrd	r7,1
+
+	/* Unfortunately, the stack pointer and the MSR are also clobbered,
+	 * so they are saved in the PACA which allows us to restore
+	 * our original state after RTAS returns.
+         */
+	std	r1,PACAR1(r13)
+        std	r6,PACASAVEDMSR(r13)
+
+	/* Setup our real return addr */	
+	LOAD_REG_ADDR(r4,rtas_return_loc)
+	clrldi	r4,r4,2			/* convert to realmode address */
+       	mtlr	r4
+
+	li	r0,0
+	ori	r0,r0,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+	andc	r0,r6,r0
+	
+        li      r9,1
+        rldicr  r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
+	ori	r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
+	andc	r6,r0,r9
+
+__enter_rtas:
+	sync				/* disable interrupts so SRR0/1 */
+	mtmsrd	r0			/* don't get trashed */
+
+	LOAD_REG_ADDR(r4, rtas)
+	ld	r5,RTASENTRY(r4)	/* get the rtas->entry value */
+	ld	r4,RTASBASE(r4)		/* get the rtas->base value */
+	
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r6
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+
+rtas_return_loc:
+	FIXUP_ENDIAN
+
+	/*
+	 * Clear RI and set SF before anything.
+	 */
+	mfmsr   r6
+	li	r0,MSR_RI
+	andc	r6,r6,r0
+	sldi	r0,r0,(MSR_SF_LG - MSR_RI_LG)
+	or	r6,r6,r0
+	sync
+	mtmsrd  r6
+
+	/* relocation is off at this point */
+	GET_PACA(r4)
+	clrldi	r4,r4,2			/* convert to realmode address */
+
+	bcl	20,31,$+4
+0:	mflr	r3
+	ld	r3,(1f-0b)(r3)		/* get &rtas_restore_regs */
+
+        ld	r1,PACAR1(r4)           /* Restore our SP */
+        ld	r4,PACASAVEDMSR(r4)     /* Restore our MSR */
+
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+_ASM_NOKPROBE_SYMBOL(__enter_rtas)
+_ASM_NOKPROBE_SYMBOL(rtas_return_loc)
+
+	.align	3
+1:	.8byte	rtas_restore_regs
+
+rtas_restore_regs:
+	/* relocation is on at this point */
+	REST_GPR(2, r1)			/* Restore the TOC */
+	REST_GPR(13, r1)		/* Restore paca */
+	REST_8GPRS(14, r1)		/* Restore the non-volatiles */
+	REST_10GPRS(22, r1)		/* ditto */
+
+	GET_PACA(r13)
+
+	ld	r4,_CCR(r1)
+	mtcr	r4
+	ld	r5,_CTR(r1)
+	mtctr	r5
+	ld	r6,_XER(r1)
+	mtspr	SPRN_XER,r6
+	ld	r7,_DAR(r1)
+	mtdar	r7
+	ld	r8,_DSISR(r1)
+	mtdsisr	r8
+
+        addi	r1,r1,RTAS_FRAME_SIZE	/* Unstack our frame */
+	ld	r0,16(r1)		/* get return address */
+
+	mtlr    r0
+        blr				/* return to caller */
+
+#endif /* CONFIG_PPC_RTAS */
+
+_GLOBAL(enter_prom)
+	mflr	r0
+	std	r0,16(r1)
+        stdu	r1,-PROM_FRAME_SIZE(r1)	/* Save SP and create stack space */
+
+	/* Because PROM is running in 32b mode, it clobbers the high order half
+	 * of all registers that it saves.  We therefore save those registers
+	 * PROM might touch to the stack.  (r0, r3-r13 are caller saved)
+   	 */
+	SAVE_GPR(2, r1)
+	SAVE_GPR(13, r1)
+	SAVE_8GPRS(14, r1)
+	SAVE_10GPRS(22, r1)
+	mfcr	r10
+	mfmsr	r11
+	std	r10,_CCR(r1)
+	std	r11,_MSR(r1)
+
+	/* Put PROM address in SRR0 */
+	mtsrr0	r4
+
+	/* Setup our trampoline return addr in LR */
+	bcl	20,31,$+4
+0:	mflr	r4
+	addi	r4,r4,(1f - 0b)
+       	mtlr	r4
+
+	/* Prepare a 32-bit mode big endian MSR
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	rlwinm	r11,r11,0,1,31
+	mtsrr1	r11
+	rfi
+#else /* CONFIG_PPC_BOOK3E */
+	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
+	andc	r11,r11,r12
+	mtsrr1	r11
+	RFI_TO_KERNEL
+#endif /* CONFIG_PPC_BOOK3E */
+
+1:	/* Return from OF */
+	FIXUP_ENDIAN
+
+	/* Just make sure that r1 top 32 bits didn't get
+	 * corrupt by OF
+	 */
+	rldicl	r1,r1,0,32
+
+	/* Restore the MSR (back to 64 bits) */
+	ld	r0,_MSR(r1)
+	MTMSRD(r0)
+        isync
+
+	/* Restore other registers */
+	REST_GPR(2, r1)
+	REST_GPR(13, r1)
+	REST_8GPRS(14, r1)
+	REST_10GPRS(22, r1)
+	ld	r4,_CCR(r1)
+	mtcr	r4
+	
+        addi	r1,r1,PROM_FRAME_SIZE
+	ld	r0,16(r1)
+	mtlr    r0
+        blr
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 000000000..52ca2471e
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/export.h>
+
+#ifndef CONFIG_PPC64
+/* epapr_ev_idle() was derived from e500_idle() */
+_GLOBAL(epapr_ev_idle)
+	CURRENT_THREAD_INFO(r3, r1)
+	PPC_LL	r4, TI_LOCAL_FLAGS(r3)	/* set napping bit */
+	ori	r4, r4,_TLF_NAPPING	/* so when we take an exception */
+	PPC_STL	r4, TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
+
+	wrteei	1
+
+idle_loop:
+	LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+
+	/*
+	 * Guard against spurious wakeups from a hypervisor --
+	 * only interrupt will cause us to return to LR due to
+	 * _TLF_NAPPING.
+	 */
+	b	idle_loop
+#endif
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
+EXPORT_SYMBOL(epapr_hypercall_start)
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 000000000..59e4ba749
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,85 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/machdep.h>
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+extern void epapr_ev_idle(void);
+extern u32 epapr_ev_idle_start[];
+#endif
+
+bool epapr_paravirt_enabled;
+static bool __maybe_unused epapr_has_idle;
+
+static int __init early_init_dt_scan_epapr(unsigned long node,
+					   const char *uname,
+					   int depth, void *data)
+{
+	const u32 *insts;
+	int len;
+	int i;
+
+	insts = of_get_flat_dt_prop(node, "hcall-instructions", &len);
+	if (!insts)
+		return 0;
+
+	if (len % 4 || len > (4 * 4))
+		return -1;
+
+	for (i = 0; i < (len / 4); i++) {
+		u32 inst = be32_to_cpu(insts[i]);
+		patch_instruction(epapr_hypercall_start + i, inst);
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+		patch_instruction(epapr_ev_idle_start + i, inst);
+#endif
+	}
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+	if (of_get_flat_dt_prop(node, "has-idle", NULL))
+		epapr_has_idle = true;
+#endif
+
+	epapr_paravirt_enabled = true;
+
+	return 1;
+}
+
+int __init epapr_paravirt_early_init(void)
+{
+	of_scan_flat_dt(early_init_dt_scan_epapr, NULL);
+
+	return 0;
+}
+
+static int __init epapr_idle_init(void)
+{
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+	if (epapr_has_idle)
+		ppc_md.power_save = epapr_ev_idle;
+#endif
+
+	return 0;
+}
+
+postcore_initcall(epapr_idle_init);
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
new file mode 100644
index 000000000..447defdd4
--- /dev/null
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -0,0 +1,1702 @@
+/*
+ *  Boot code and exception vectors for Book3E processors
+ *
+ *  Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/setup.h>
+#include <asm/thread_info.h>
+#include <asm/reg_a2.h>
+#include <asm/exception-64e.h>
+#include <asm/bug.h>
+#include <asm/irqflags.h>
+#include <asm/ptrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/mmu.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+#include <asm/feature-fixups.h>
+
+/* XXX This will ultimately add space for a special exception save
+ *     structure used to save things like SRR0/SRR1, SPRGs, MAS, etc...
+ *     when taking special interrupts. For now we don't support that,
+ *     special interrupts from within a non-standard level will probably
+ *     blow you up
+ */
+#define SPECIAL_EXC_SRR0	0
+#define SPECIAL_EXC_SRR1	1
+#define SPECIAL_EXC_SPRG_GEN	2
+#define SPECIAL_EXC_SPRG_TLB	3
+#define SPECIAL_EXC_MAS0	4
+#define SPECIAL_EXC_MAS1	5
+#define SPECIAL_EXC_MAS2	6
+#define SPECIAL_EXC_MAS3	7
+#define SPECIAL_EXC_MAS6	8
+#define SPECIAL_EXC_MAS7	9
+#define SPECIAL_EXC_MAS5	10	/* E.HV only */
+#define SPECIAL_EXC_MAS8	11	/* E.HV only */
+#define SPECIAL_EXC_IRQHAPPENED	12
+#define SPECIAL_EXC_DEAR	13
+#define SPECIAL_EXC_ESR		14
+#define SPECIAL_EXC_SOFTE	15
+#define SPECIAL_EXC_CSRR0	16
+#define SPECIAL_EXC_CSRR1	17
+/* must be even to keep 16-byte stack alignment */
+#define SPECIAL_EXC_END		18
+
+#define SPECIAL_EXC_FRAME_SIZE	(INT_FRAME_SIZE + SPECIAL_EXC_END * 8)
+#define SPECIAL_EXC_FRAME_OFFS  (INT_FRAME_SIZE - 288)
+
+#define SPECIAL_EXC_STORE(reg, name) \
+	std	reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+#define SPECIAL_EXC_LOAD(reg, name) \
+	ld	reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+special_reg_save:
+	lbz	r9,PACAIRQHAPPENED(r13)
+	RECONCILE_IRQ_STATE(r3,r4)
+
+	/*
+	 * We only need (or have stack space) to save this stuff if
+	 * we interrupted the kernel.
+	 */
+	ld	r3,_MSR(r1)
+	andi.	r3,r3,MSR_PR
+	bnelr
+
+	/* Copy info into temporary exception thread info */
+	ld	r11,PACAKSAVE(r13)
+	CURRENT_THREAD_INFO(r11, r11)
+	CURRENT_THREAD_INFO(r12, r1)
+	ld	r10,TI_FLAGS(r11)
+	std	r10,TI_FLAGS(r12)
+	ld	r10,TI_PREEMPT(r11)
+	std	r10,TI_PREEMPT(r12)
+	ld	r10,TI_TASK(r11)
+	std	r10,TI_TASK(r12)
+
+	/*
+	 * Advance to the next TLB exception frame for handler
+	 * types that don't do it automatically.
+	 */
+	LOAD_REG_ADDR(r11,extlb_level_exc)
+	lwz	r12,0(r11)
+	mfspr	r10,SPRN_SPRG_TLB_EXFRAME
+	add	r10,r10,r12
+	mtspr	SPRN_SPRG_TLB_EXFRAME,r10
+
+	/*
+	 * Save registers needed to allow nesting of certain exceptions
+	 * (such as TLB misses) inside special exception levels
+	 */
+	mfspr	r10,SPRN_SRR0
+	SPECIAL_EXC_STORE(r10,SRR0)
+	mfspr	r10,SPRN_SRR1
+	SPECIAL_EXC_STORE(r10,SRR1)
+	mfspr	r10,SPRN_SPRG_GEN_SCRATCH
+	SPECIAL_EXC_STORE(r10,SPRG_GEN)
+	mfspr	r10,SPRN_SPRG_TLB_SCRATCH
+	SPECIAL_EXC_STORE(r10,SPRG_TLB)
+	mfspr	r10,SPRN_MAS0
+	SPECIAL_EXC_STORE(r10,MAS0)
+	mfspr	r10,SPRN_MAS1
+	SPECIAL_EXC_STORE(r10,MAS1)
+	mfspr	r10,SPRN_MAS2
+	SPECIAL_EXC_STORE(r10,MAS2)
+	mfspr	r10,SPRN_MAS3
+	SPECIAL_EXC_STORE(r10,MAS3)
+	mfspr	r10,SPRN_MAS6
+	SPECIAL_EXC_STORE(r10,MAS6)
+	mfspr	r10,SPRN_MAS7
+	SPECIAL_EXC_STORE(r10,MAS7)
+BEGIN_FTR_SECTION
+	mfspr	r10,SPRN_MAS5
+	SPECIAL_EXC_STORE(r10,MAS5)
+	mfspr	r10,SPRN_MAS8
+	SPECIAL_EXC_STORE(r10,MAS8)
+
+	/* MAS5/8 could have inappropriate values if we interrupted KVM code */
+	li	r10,0
+	mtspr	SPRN_MAS5,r10
+	mtspr	SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+	SPECIAL_EXC_STORE(r9,IRQHAPPENED)
+
+	mfspr	r10,SPRN_DEAR
+	SPECIAL_EXC_STORE(r10,DEAR)
+	mfspr	r10,SPRN_ESR
+	SPECIAL_EXC_STORE(r10,ESR)
+
+	lbz	r10,PACAIRQSOFTMASK(r13)
+	SPECIAL_EXC_STORE(r10,SOFTE)
+	ld	r10,_NIP(r1)
+	SPECIAL_EXC_STORE(r10,CSRR0)
+	ld	r10,_MSR(r1)
+	SPECIAL_EXC_STORE(r10,CSRR1)
+
+	blr
+
+ret_from_level_except:
+	ld	r3,_MSR(r1)
+	andi.	r3,r3,MSR_PR
+	beq	1f
+	b	ret_from_except
+1:
+
+	LOAD_REG_ADDR(r11,extlb_level_exc)
+	lwz	r12,0(r11)
+	mfspr	r10,SPRN_SPRG_TLB_EXFRAME
+	sub	r10,r10,r12
+	mtspr	SPRN_SPRG_TLB_EXFRAME,r10
+
+	/*
+	 * It's possible that the special level exception interrupted a
+	 * TLB miss handler, and inserted the same entry that the
+	 * interrupted handler was about to insert.  On CPUs without TLB
+	 * write conditional, this can result in a duplicate TLB entry.
+	 * Wipe all non-bolted entries to be safe.
+	 *
+	 * Note that this doesn't protect against any TLB misses
+	 * we may take accessing the stack from here to the end of
+	 * the special level exception.  It's not clear how we can
+	 * reasonably protect against that, but only CPUs with
+	 * neither TLB write conditional nor bolted kernel memory
+	 * are affected.  Do any such CPUs even exist?
+	 */
+	PPC_TLBILX_ALL(0,R0)
+
+	REST_NVGPRS(r1)
+
+	SPECIAL_EXC_LOAD(r10,SRR0)
+	mtspr	SPRN_SRR0,r10
+	SPECIAL_EXC_LOAD(r10,SRR1)
+	mtspr	SPRN_SRR1,r10
+	SPECIAL_EXC_LOAD(r10,SPRG_GEN)
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r10
+	SPECIAL_EXC_LOAD(r10,SPRG_TLB)
+	mtspr	SPRN_SPRG_TLB_SCRATCH,r10
+	SPECIAL_EXC_LOAD(r10,MAS0)
+	mtspr	SPRN_MAS0,r10
+	SPECIAL_EXC_LOAD(r10,MAS1)
+	mtspr	SPRN_MAS1,r10
+	SPECIAL_EXC_LOAD(r10,MAS2)
+	mtspr	SPRN_MAS2,r10
+	SPECIAL_EXC_LOAD(r10,MAS3)
+	mtspr	SPRN_MAS3,r10
+	SPECIAL_EXC_LOAD(r10,MAS6)
+	mtspr	SPRN_MAS6,r10
+	SPECIAL_EXC_LOAD(r10,MAS7)
+	mtspr	SPRN_MAS7,r10
+BEGIN_FTR_SECTION
+	SPECIAL_EXC_LOAD(r10,MAS5)
+	mtspr	SPRN_MAS5,r10
+	SPECIAL_EXC_LOAD(r10,MAS8)
+	mtspr	SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+
+	lbz	r6,PACAIRQSOFTMASK(r13)
+	ld	r5,SOFTE(r1)
+
+	/* Interrupts had better not already be enabled... */
+	tweqi	r6,IRQS_ENABLED
+
+	andi.	r6,r5,IRQS_DISABLED
+	bne	1f
+
+	TRACE_ENABLE_INTS
+	stb	r5,PACAIRQSOFTMASK(r13)
+1:
+	/*
+	 * Restore PACAIRQHAPPENED rather than setting it based on
+	 * the return MSR[EE], since we could have interrupted
+	 * __check_irq_replay() or other inconsistent transitory
+	 * states that must remain that way.
+	 */
+	SPECIAL_EXC_LOAD(r10,IRQHAPPENED)
+	stb	r10,PACAIRQHAPPENED(r13)
+
+	SPECIAL_EXC_LOAD(r10,DEAR)
+	mtspr	SPRN_DEAR,r10
+	SPECIAL_EXC_LOAD(r10,ESR)
+	mtspr	SPRN_ESR,r10
+
+	stdcx.	r0,0,r1		/* to clear the reservation */
+
+	REST_4GPRS(2, r1)
+	REST_4GPRS(6, r1)
+
+	ld	r10,_CTR(r1)
+	ld	r11,_XER(r1)
+	mtctr	r10
+	mtxer	r11
+
+	blr
+
+.macro ret_from_level srr0 srr1 paca_ex scratch
+	bl	ret_from_level_except
+
+	ld	r10,_LINK(r1)
+	ld	r11,_CCR(r1)
+	ld	r0,GPR13(r1)
+	mtlr	r10
+	mtcr	r11
+
+	ld	r10,GPR10(r1)
+	ld	r11,GPR11(r1)
+	ld	r12,GPR12(r1)
+	mtspr	\scratch,r0
+
+	std	r10,\paca_ex+EX_R10(r13);
+	std	r11,\paca_ex+EX_R11(r13);
+	ld	r10,_NIP(r1)
+	ld	r11,_MSR(r1)
+	ld	r0,GPR0(r1)
+	ld	r1,GPR1(r1)
+	mtspr	\srr0,r10
+	mtspr	\srr1,r11
+	ld	r10,\paca_ex+EX_R10(r13)
+	ld	r11,\paca_ex+EX_R11(r13)
+	mfspr	r13,\scratch
+.endm
+
+ret_from_crit_except:
+	ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH
+	rfci
+
+ret_from_mc_except:
+	ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH
+	rfmci
+
+/* Exception prolog code for all exceptions */
+#define EXCEPTION_PROLOG(n, intnum, type, addition)	    		    \
+	mtspr	SPRN_SPRG_##type##_SCRATCH,r13;	/* get spare registers */   \
+	mfspr	r13,SPRN_SPRG_PACA;	/* get PACA */			    \
+	std	r10,PACA_EX##type+EX_R10(r13);				    \
+	std	r11,PACA_EX##type+EX_R11(r13);				    \
+	mfcr	r10;			/* save CR */			    \
+	mfspr	r11,SPRN_##type##_SRR1;/* what are we coming from */	    \
+	DO_KVM	intnum,SPRN_##type##_SRR1;    /* KVM hook */		    \
+	stw	r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \
+	addition;			/* additional code for that exc. */ \
+	std	r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */  \
+	type##_SET_KSTACK;		/* get special stack if necessary */\
+	andi.	r10,r11,MSR_PR;		/* save stack pointer */	    \
+	beq	1f;			/* branch around if supervisor */   \
+	ld	r1,PACAKSAVE(r13);	/* get kernel stack coming from usr */\
+1:	type##_BTB_FLUSH		\
+	cmpdi	cr1,r1,0;		/* check if SP makes sense */	    \
+	bge-	cr1,exc_##n##_bad_stack;/* bad stack (TODO: out of line) */ \
+	mfspr	r10,SPRN_##type##_SRR0;	/* read SRR0 before touching stack */
+
+/* Exception type-specific macros */
+#define	GEN_SET_KSTACK							    \
+	subi	r1,r1,INT_FRAME_SIZE;	/* alloc frame on kernel stack */
+#define SPRN_GEN_SRR0	SPRN_SRR0
+#define SPRN_GEN_SRR1	SPRN_SRR1
+
+#define	GDBELL_SET_KSTACK	GEN_SET_KSTACK
+#define SPRN_GDBELL_SRR0	SPRN_GSRR0
+#define SPRN_GDBELL_SRR1	SPRN_GSRR1
+
+#define CRIT_SET_KSTACK						            \
+	ld	r1,PACA_CRIT_STACK(r13);				    \
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
+#define SPRN_CRIT_SRR0	SPRN_CSRR0
+#define SPRN_CRIT_SRR1	SPRN_CSRR1
+
+#define DBG_SET_KSTACK						            \
+	ld	r1,PACA_DBG_STACK(r13);					    \
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
+#define SPRN_DBG_SRR0	SPRN_DSRR0
+#define SPRN_DBG_SRR1	SPRN_DSRR1
+
+#define MC_SET_KSTACK						            \
+	ld	r1,PACA_MC_STACK(r13);					    \
+	subi	r1,r1,SPECIAL_EXC_FRAME_SIZE
+#define SPRN_MC_SRR0	SPRN_MCSRR0
+#define SPRN_MC_SRR1	SPRN_MCSRR1
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define GEN_BTB_FLUSH			\
+	START_BTB_FLUSH_SECTION		\
+		beq 1f;			\
+		BTB_FLUSH(r10)			\
+		1:		\
+	END_BTB_FLUSH_SECTION
+
+#define CRIT_BTB_FLUSH			\
+	START_BTB_FLUSH_SECTION		\
+		BTB_FLUSH(r10)		\
+	END_BTB_FLUSH_SECTION
+
+#define DBG_BTB_FLUSH CRIT_BTB_FLUSH
+#define MC_BTB_FLUSH CRIT_BTB_FLUSH
+#define GDBELL_BTB_FLUSH GEN_BTB_FLUSH
+#else
+#define GEN_BTB_FLUSH
+#define CRIT_BTB_FLUSH
+#define DBG_BTB_FLUSH
+#define MC_BTB_FLUSH
+#define GDBELL_BTB_FLUSH
+#endif
+
+#define NORMAL_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n))
+
+#define CRIT_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, CRIT, addition##_CRIT(n))
+
+#define DBG_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, DBG, addition##_DBG(n))
+
+#define MC_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, MC, addition##_MC(n))
+
+#define GDBELL_EXCEPTION_PROLOG(n, intnum, addition)			    \
+	EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n))
+
+/* Variants of the "addition" argument for the prolog
+ */
+#define PROLOG_ADDITION_NONE_GEN(n)
+#define PROLOG_ADDITION_NONE_GDBELL(n)
+#define PROLOG_ADDITION_NONE_CRIT(n)
+#define PROLOG_ADDITION_NONE_DBG(n)
+#define PROLOG_ADDITION_NONE_MC(n)
+
+#define PROLOG_ADDITION_MASKABLE_GEN(n)					    \
+	lbz	r10,PACAIRQSOFTMASK(r13);	/* are irqs soft-masked? */ \
+	andi.	r10,r10,IRQS_DISABLED;	/* yes -> go out of line */ \
+	bne	masked_interrupt_book3e_##n
+
+#define PROLOG_ADDITION_2REGS_GEN(n)					    \
+	std	r14,PACA_EXGEN+EX_R14(r13);				    \
+	std	r15,PACA_EXGEN+EX_R15(r13)
+
+#define PROLOG_ADDITION_1REG_GEN(n)					    \
+	std	r14,PACA_EXGEN+EX_R14(r13);
+
+#define PROLOG_ADDITION_2REGS_CRIT(n)					    \
+	std	r14,PACA_EXCRIT+EX_R14(r13);				    \
+	std	r15,PACA_EXCRIT+EX_R15(r13)
+
+#define PROLOG_ADDITION_2REGS_DBG(n)					    \
+	std	r14,PACA_EXDBG+EX_R14(r13);				    \
+	std	r15,PACA_EXDBG+EX_R15(r13)
+
+#define PROLOG_ADDITION_2REGS_MC(n)					    \
+	std	r14,PACA_EXMC+EX_R14(r13);				    \
+	std	r15,PACA_EXMC+EX_R15(r13)
+
+
+/* Core exception code for all exceptions except TLB misses. */
+#define EXCEPTION_COMMON_LVL(n, scratch, excf)				    \
+exc_##n##_common:							    \
+	std	r0,GPR0(r1);		/* save r0 in stackframe */	    \
+	std	r2,GPR2(r1);		/* save r2 in stackframe */	    \
+	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe */    \
+	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe */	    \
+	std	r9,GPR9(r1);		/* save r9 in stackframe */	    \
+	std	r10,_NIP(r1);		/* save SRR0 to stackframe */	    \
+	std	r11,_MSR(r1);		/* save SRR1 to stackframe */	    \
+	beq	2f;			/* if from kernel mode */	    \
+	ACCOUNT_CPU_USER_ENTRY(r13,r10,r11);/* accounting (uses cr0+eq) */  \
+2:	ld	r3,excf+EX_R10(r13);	/* get back r10 */		    \
+	ld	r4,excf+EX_R11(r13);	/* get back r11 */		    \
+	mfspr	r5,scratch;		/* get back r13 */		    \
+	std	r12,GPR12(r1);		/* save r12 in stackframe */	    \
+	ld	r2,PACATOC(r13);	/* get kernel TOC into r2 */	    \
+	mflr	r6;			/* save LR in stackframe */	    \
+	mfctr	r7;			/* save CTR in stackframe */	    \
+	mfspr	r8,SPRN_XER;		/* save XER in stackframe */	    \
+	ld	r9,excf+EX_R1(r13);	/* load orig r1 back from PACA */   \
+	lwz	r10,excf+EX_CR(r13);	/* load orig CR back from PACA	*/  \
+	lbz	r11,PACAIRQSOFTMASK(r13); /* get current IRQ softe */	    \
+	ld	r12,exception_marker@toc(r2);				    \
+	li	r0,0;							    \
+	std	r3,GPR10(r1);		/* save r10 to stackframe */	    \
+	std	r4,GPR11(r1);		/* save r11 to stackframe */	    \
+	std	r5,GPR13(r1);		/* save it to stackframe */	    \
+	std	r6,_LINK(r1);						    \
+	std	r7,_CTR(r1);						    \
+	std	r8,_XER(r1);						    \
+	li	r3,(n)+1;		/* indicate partial regs in trap */ \
+	std	r9,0(r1);		/* store stack frame back link */   \
+	std	r10,_CCR(r1);		/* store orig CR in stackframe */   \
+	std	r9,GPR1(r1);		/* store stack frame back link */   \
+	std	r11,SOFTE(r1);		/* and save it to stackframe */     \
+	std	r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */	    \
+	std	r3,_TRAP(r1);		/* set trap number		*/  \
+	std	r0,RESULT(r1);		/* clear regs->result */
+
+#define EXCEPTION_COMMON(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN)
+#define EXCEPTION_COMMON_CRIT(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT)
+#define EXCEPTION_COMMON_MC(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_MC_SCRATCH, PACA_EXMC)
+#define EXCEPTION_COMMON_DBG(n) \
+	EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG)
+
+/*
+ * This is meant for exceptions that don't immediately hard-enable.  We
+ * set a bit in paca->irq_happened to ensure that a subsequent call to
+ * arch_local_irq_restore() will properly hard-enable and avoid the
+ * fast-path, and then reconcile irq state.
+ */
+#define INTS_DISABLE	RECONCILE_IRQ_STATE(r3,r4)
+
+/*
+ * This is called by exceptions that don't use INTS_DISABLE (that did not
+ * touch irq indicators in the PACA).  This will restore MSR:EE to it's
+ * previous value
+ *
+ * XXX In the long run, we may want to open-code it in order to separate the
+ *     load from the wrtee, thus limiting the latency caused by the dependency
+ *     but at this point, I'll favor code clarity until we have a near to final
+ *     implementation
+ */
+#define INTS_RESTORE_HARD						    \
+	ld	r11,_MSR(r1);						    \
+	wrtee	r11;
+
+/* XXX FIXME: Restore r14/r15 when necessary */
+#define BAD_STACK_TRAMPOLINE(n)						    \
+exc_##n##_bad_stack:							    \
+	li	r1,(n);			/* get exception number */	    \
+	sth	r1,PACA_TRAP_SAVE(r13);	/* store trap */		    \
+	b	bad_stack_book3e;	/* bad stack error */
+
+/* WARNING: If you change the layout of this stub, make sure you check
+	*   the debug exception handler which handles single stepping
+	*   into exceptions from userspace, and the MM code in
+	*   arch/powerpc/mm/tlb_nohash.c which patches the branch here
+	*   and would need to be updated if that branch is moved
+	*/
+#define	EXCEPTION_STUB(loc, label)					\
+	. = interrupt_base_book3e + loc;				\
+	nop;	/* To make debug interrupts happy */			\
+	b	exc_##label##_book3e;
+
+#define ACK_NONE(r)
+#define ACK_DEC(r)							\
+	lis	r,TSR_DIS@h;						\
+	mtspr	SPRN_TSR,r
+#define ACK_FIT(r)							\
+	lis	r,TSR_FIS@h;						\
+	mtspr	SPRN_TSR,r
+
+/* Used by asynchronous interrupt that may happen in the idle loop.
+ *
+ * This check if the thread was in the idle loop, and if yes, returns
+ * to the caller rather than the PC. This is to avoid a race if
+ * interrupts happen before the wait instruction.
+ */
+#define CHECK_NAPPING()							\
+	CURRENT_THREAD_INFO(r11, r1);					\
+	ld	r10,TI_LOCAL_FLAGS(r11);				\
+	andi.	r9,r10,_TLF_NAPPING;					\
+	beq+	1f;							\
+	ld	r8,_LINK(r1);						\
+	rlwinm	r7,r10,0,~_TLF_NAPPING;					\
+	std	r8,_NIP(r1);						\
+	std	r7,TI_LOCAL_FLAGS(r11);					\
+1:
+
+
+#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack)		\
+	START_EXCEPTION(label);						\
+	NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
+	EXCEPTION_COMMON(trapnum)					\
+	INTS_DISABLE;							\
+	ack(r8);							\
+	CHECK_NAPPING();						\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
+	bl	hdlr;							\
+	b	ret_from_except_lite;
+
+/* This value is used to mark exception frames on the stack. */
+	.section	".toc","aw"
+exception_marker:
+	.tc	ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER
+
+
+/*
+ * And here we have the exception vectors !
+ */
+
+	.text
+	.balign	0x1000
+	.globl interrupt_base_book3e
+interrupt_base_book3e:					/* fake trap */
+	EXCEPTION_STUB(0x000, machine_check)
+	EXCEPTION_STUB(0x020, critical_input)		/* 0x0100 */
+	EXCEPTION_STUB(0x040, debug_crit)		/* 0x0d00 */
+	EXCEPTION_STUB(0x060, data_storage)		/* 0x0300 */
+	EXCEPTION_STUB(0x080, instruction_storage)	/* 0x0400 */
+	EXCEPTION_STUB(0x0a0, external_input)		/* 0x0500 */
+	EXCEPTION_STUB(0x0c0, alignment)		/* 0x0600 */
+	EXCEPTION_STUB(0x0e0, program)			/* 0x0700 */
+	EXCEPTION_STUB(0x100, fp_unavailable)		/* 0x0800 */
+	EXCEPTION_STUB(0x120, system_call)		/* 0x0c00 */
+	EXCEPTION_STUB(0x140, ap_unavailable)		/* 0x0f20 */
+	EXCEPTION_STUB(0x160, decrementer)		/* 0x0900 */
+	EXCEPTION_STUB(0x180, fixed_interval)		/* 0x0980 */
+	EXCEPTION_STUB(0x1a0, watchdog)			/* 0x09f0 */
+	EXCEPTION_STUB(0x1c0, data_tlb_miss)
+	EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+	EXCEPTION_STUB(0x200, altivec_unavailable)
+	EXCEPTION_STUB(0x220, altivec_assist)
+	EXCEPTION_STUB(0x260, perfmon)
+	EXCEPTION_STUB(0x280, doorbell)
+	EXCEPTION_STUB(0x2a0, doorbell_crit)
+	EXCEPTION_STUB(0x2c0, guest_doorbell)
+	EXCEPTION_STUB(0x2e0, guest_doorbell_crit)
+	EXCEPTION_STUB(0x300, hypercall)
+	EXCEPTION_STUB(0x320, ehpriv)
+	EXCEPTION_STUB(0x340, lrat_error)
+
+	.globl __end_interrupts
+__end_interrupts:
+
+/* Critical Input Interrupt */
+	START_EXCEPTION(critical_input);
+	CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x100)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
+
+/* Machine Check Interrupt */
+	START_EXCEPTION(machine_check);
+	MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK,
+			    PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_MC(0x000)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_mc_except
+
+/* Data Storage Interrupt */
+	START_EXCEPTION(data_storage)
+	NORMAL_EXCEPTION_PROLOG(0x300, BOOKE_INTERRUPT_DATA_STORAGE,
+				PROLOG_ADDITION_2REGS)
+	mfspr	r14,SPRN_DEAR
+	mfspr	r15,SPRN_ESR
+	EXCEPTION_COMMON(0x300)
+	INTS_DISABLE
+	b	storage_fault_common
+
+/* Instruction Storage Interrupt */
+	START_EXCEPTION(instruction_storage);
+	NORMAL_EXCEPTION_PROLOG(0x400, BOOKE_INTERRUPT_INST_STORAGE,
+				PROLOG_ADDITION_2REGS)
+	li	r15,0
+	mr	r14,r10
+	EXCEPTION_COMMON(0x400)
+	INTS_DISABLE
+	b	storage_fault_common
+
+/* External Input Interrupt */
+	MASKABLE_EXCEPTION(0x500, BOOKE_INTERRUPT_EXTERNAL,
+			   external_input, do_IRQ, ACK_NONE)
+
+/* Alignment */
+	START_EXCEPTION(alignment);
+	NORMAL_EXCEPTION_PROLOG(0x600, BOOKE_INTERRUPT_ALIGNMENT,
+				PROLOG_ADDITION_2REGS)
+	mfspr	r14,SPRN_DEAR
+	mfspr	r15,SPRN_ESR
+	EXCEPTION_COMMON(0x600)
+	b	alignment_more	/* no room, go out of line */
+
+/* Program Interrupt */
+	START_EXCEPTION(program);
+	NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM,
+				PROLOG_ADDITION_1REG)
+	mfspr	r14,SPRN_ESR
+	EXCEPTION_COMMON(0x700)
+	INTS_DISABLE
+	std	r14,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ld	r14,PACA_EXGEN+EX_R14(r13)
+	bl	save_nvgprs
+	bl	program_check_exception
+	b	ret_from_except
+
+/* Floating Point Unavailable Interrupt */
+	START_EXCEPTION(fp_unavailable);
+	NORMAL_EXCEPTION_PROLOG(0x800, BOOKE_INTERRUPT_FP_UNAVAIL,
+				PROLOG_ADDITION_NONE)
+	/* we can probably do a shorter exception entry for that one... */
+	EXCEPTION_COMMON(0x800)
+	ld	r12,_MSR(r1)
+	andi.	r0,r12,MSR_PR;
+	beq-	1f
+	bl	load_up_fpu
+	b	fast_exception_return
+1:	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	kernel_fp_unavailable_exception
+	b	ret_from_except
+
+/* Altivec Unavailable Interrupt */
+	START_EXCEPTION(altivec_unavailable);
+	NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_ALTIVEC_UNAVAIL,
+				PROLOG_ADDITION_NONE)
+	/* we can probably do a shorter exception entry for that one... */
+	EXCEPTION_COMMON(0x200)
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	ld	r12,_MSR(r1)
+	andi.	r0,r12,MSR_PR;
+	beq-	1f
+	bl	load_up_altivec
+	b	fast_exception_return
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	altivec_unavailable_exception
+	b	ret_from_except
+
+/* AltiVec Assist */
+	START_EXCEPTION(altivec_assist);
+	NORMAL_EXCEPTION_PROLOG(0x220,
+				BOOKE_INTERRUPT_ALTIVEC_ASSIST,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x220)
+	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	bl	altivec_assist_exception
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#else
+	bl	unknown_exception
+#endif
+	b	ret_from_except
+
+
+/* Decrementer Interrupt */
+	MASKABLE_EXCEPTION(0x900, BOOKE_INTERRUPT_DECREMENTER,
+			   decrementer, timer_interrupt, ACK_DEC)
+
+/* Fixed Interval Timer Interrupt */
+	MASKABLE_EXCEPTION(0x980, BOOKE_INTERRUPT_FIT,
+			   fixed_interval, unknown_exception, ACK_FIT)
+
+/* Watchdog Timer Interrupt */
+	START_EXCEPTION(watchdog);
+	CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x9f0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_BOOKE_WDT
+	bl	WatchdogException
+#else
+	bl	unknown_exception
+#endif
+	b	ret_from_crit_except
+
+/* System Call Interrupt */
+	START_EXCEPTION(system_call)
+	mr	r9,r13			/* keep a copy of userland r13 */
+	mfspr	r11,SPRN_SRR0		/* get return address */
+	mfspr	r12,SPRN_SRR1		/* get previous MSR */
+	mfspr	r13,SPRN_SPRG_PACA	/* get our PACA */
+	b	system_call_common
+
+/* Auxiliary Processor Unavailable Interrupt */
+	START_EXCEPTION(ap_unavailable);
+	NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0xf20)
+	INTS_DISABLE
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* Debug exception as a critical interrupt*/
+	START_EXCEPTION(debug_crit);
+	CRIT_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+			      PROLOG_ADDITION_2REGS)
+
+	/*
+	 * If there is a single step or branch-taken exception in an
+	 * exception entry sequence, it was probably meant to apply to
+	 * the code where the exception occurred (since exception entry
+	 * doesn't turn off DE automatically).  We simulate the effect
+	 * of turning off DE on entry to an exception handler by turning
+	 * off DE in the CSRR1 value and clearing the debug status.
+	 */
+
+	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
+	beq+	1f
+
+#ifdef CONFIG_RELOCATABLE
+	ld	r15,PACATOC(r13)
+	ld	r14,interrupt_base_book3e@got(r15)
+	ld	r15,__end_interrupts@got(r15)
+#else
+	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
+	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+#endif
+	cmpld	cr0,r10,r14
+	cmpld	cr1,r10,r15
+	blt+	cr0,1f
+	bge+	cr1,1f
+
+	/* here it looks like we got an inappropriate debug exception. */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
+	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the CSRR1 value */
+	mtspr	SPRN_DBSR,r14
+	mtspr	SPRN_CSRR1,r11
+	lwz	r10,PACA_EXCRIT+EX_CR(r13)	/* restore registers */
+	ld	r1,PACA_EXCRIT+EX_R1(r13)
+	ld	r14,PACA_EXCRIT+EX_R14(r13)
+	ld	r15,PACA_EXCRIT+EX_R15(r13)
+	mtcr	r10
+	ld	r10,PACA_EXCRIT+EX_R10(r13)	/* restore registers */
+	ld	r11,PACA_EXCRIT+EX_R11(r13)
+	mfspr	r13,SPRN_SPRG_CRIT_SCRATCH
+	rfci
+
+	/* Normal debug exception */
+	/* XXX We only handle coming from userspace for now since we can't
+	 *     quite save properly an interrupted kernel state yet
+	 */
+1:	andi.	r14,r11,MSR_PR;		/* check for userspace again */
+	beq	kernel_dbg_exc;		/* if from kernel mode */
+
+	/* Now we mash up things to make it look like we are coming on a
+	 * normal exception
+	 */
+	mfspr	r14,SPRN_DBSR
+	EXCEPTION_COMMON_CRIT(0xd00)
+	std	r14,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	mr	r4,r14
+	ld	r14,PACA_EXCRIT+EX_R14(r13)
+	ld	r15,PACA_EXCRIT+EX_R15(r13)
+	bl	save_nvgprs
+	bl	DebugException
+	b	ret_from_except
+
+kernel_dbg_exc:
+	b	.	/* NYI */
+
+/* Debug exception as a debug interrupt*/
+	START_EXCEPTION(debug_debug);
+	DBG_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+						 PROLOG_ADDITION_2REGS)
+
+	/*
+	 * If there is a single step or branch-taken exception in an
+	 * exception entry sequence, it was probably meant to apply to
+	 * the code where the exception occurred (since exception entry
+	 * doesn't turn off DE automatically).  We simulate the effect
+	 * of turning off DE on entry to an exception handler by turning
+	 * off DE in the DSRR1 value and clearing the debug status.
+	 */
+
+	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
+	beq+	1f
+
+#ifdef CONFIG_RELOCATABLE
+	ld	r15,PACATOC(r13)
+	ld	r14,interrupt_base_book3e@got(r15)
+	ld	r15,__end_interrupts@got(r15)
+#else
+	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
+	LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+#endif
+	cmpld	cr0,r10,r14
+	cmpld	cr1,r10,r15
+	blt+	cr0,1f
+	bge+	cr1,1f
+
+	/* here it looks like we got an inappropriate debug exception. */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
+	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the DSRR1 value */
+	mtspr	SPRN_DBSR,r14
+	mtspr	SPRN_DSRR1,r11
+	lwz	r10,PACA_EXDBG+EX_CR(r13)	/* restore registers */
+	ld	r1,PACA_EXDBG+EX_R1(r13)
+	ld	r14,PACA_EXDBG+EX_R14(r13)
+	ld	r15,PACA_EXDBG+EX_R15(r13)
+	mtcr	r10
+	ld	r10,PACA_EXDBG+EX_R10(r13)	/* restore registers */
+	ld	r11,PACA_EXDBG+EX_R11(r13)
+	mfspr	r13,SPRN_SPRG_DBG_SCRATCH
+	rfdi
+
+	/* Normal debug exception */
+	/* XXX We only handle coming from userspace for now since we can't
+	 *     quite save properly an interrupted kernel state yet
+	 */
+1:	andi.	r14,r11,MSR_PR;		/* check for userspace again */
+	beq	kernel_dbg_exc;		/* if from kernel mode */
+
+	/* Now we mash up things to make it look like we are coming on a
+	 * normal exception
+	 */
+	mfspr	r14,SPRN_DBSR
+	EXCEPTION_COMMON_DBG(0xd08)
+	INTS_DISABLE
+	std	r14,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	mr	r4,r14
+	ld	r14,PACA_EXDBG+EX_R14(r13)
+	ld	r15,PACA_EXDBG+EX_R15(r13)
+	bl	save_nvgprs
+	bl	DebugException
+	b	ret_from_except
+
+	START_EXCEPTION(perfmon);
+	NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR,
+				PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x260)
+	INTS_DISABLE
+	CHECK_NAPPING()
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	performance_monitor_exception
+	b	ret_from_except_lite
+
+/* Doorbell interrupt */
+	MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL,
+			   doorbell, doorbell_exception, ACK_NONE)
+
+/* Doorbell critical Interrupt */
+	START_EXCEPTION(doorbell_crit);
+	CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x2a0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
+
+/*
+ *	Guest doorbell interrupt
+ *	This general exception use GSRRx save/restore registers
+ */
+	START_EXCEPTION(guest_doorbell);
+	GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x2c0)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* Guest Doorbell critical Interrupt */
+	START_EXCEPTION(guest_doorbell_crit);
+	CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT,
+			      PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON_CRIT(0x2e0)
+	bl	save_nvgprs
+	bl	special_reg_save
+	CHECK_NAPPING();
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unknown_exception
+	b	ret_from_crit_except
+
+/* Hypervisor call */
+	START_EXCEPTION(hypercall);
+	NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x310)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* Embedded Hypervisor priviledged  */
+	START_EXCEPTION(ehpriv);
+	NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x320)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/* LRAT Error interrupt */
+	START_EXCEPTION(lrat_error);
+	NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x340)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	unknown_exception
+	b	ret_from_except
+
+/*
+ * An interrupt came in while soft-disabled; We mark paca->irq_happened
+ * accordingly and if the interrupt is level sensitive, we hard disable
+ * hard disable (full_mask) corresponds to PACA_IRQ_MUST_HARD_MASK, so
+ * keep these in synch.
+ */
+
+.macro masked_interrupt_book3e paca_irq full_mask
+	lbz	r10,PACAIRQHAPPENED(r13)
+	.if \full_mask == 1
+	ori	r10,r10,\paca_irq | PACA_IRQ_HARD_DIS
+	.else
+	ori	r10,r10,\paca_irq
+	.endif
+	stb	r10,PACAIRQHAPPENED(r13)
+
+	.if \full_mask == 1
+	rldicl	r10,r11,48,1		/* clear MSR_EE */
+	rotldi	r11,r10,16
+	mtspr	SPRN_SRR1,r11
+	.endif
+
+	lwz	r11,PACA_EXGEN+EX_CR(r13)
+	mtcr	r11
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	mfspr	r13,SPRN_SPRG_GEN_SCRATCH
+	rfi
+	b	.
+.endm
+
+masked_interrupt_book3e_0x500:
+	// XXX When adding support for EPR, use PACA_IRQ_EE_EDGE
+	masked_interrupt_book3e PACA_IRQ_EE 1
+
+masked_interrupt_book3e_0x900:
+	ACK_DEC(r10);
+	masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x980:
+	ACK_FIT(r10);
+	masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x280:
+masked_interrupt_book3e_0x2c0:
+	masked_interrupt_book3e PACA_IRQ_DBELL 0
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280
+ * to indicate the kind of interrupt. MSR:EE is already off.
+ * We generate a stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+	/* We are going to jump to the exception common code which
+	 * will retrieve various register values from the PACA which
+	 * we don't give a damn about.
+	 */
+	mflr	r10
+	mfmsr	r11
+	mfcr	r4
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r13;
+	std	r1,PACA_EXGEN+EX_R1(r13);
+	stw	r4,PACA_EXGEN+EX_CR(r13);
+	ori	r11,r11,MSR_EE
+	subi	r1,r1,INT_FRAME_SIZE;
+	cmpwi	cr0,r3,0x500
+	beq	exc_0x500_common
+	cmpwi	cr0,r3,0x900
+	beq	exc_0x900_common
+	cmpwi	cr0,r3,0x280
+	beq	exc_0x280_common
+	blr
+
+
+/*
+ * This is called from 0x300 and 0x400 handlers after the prologs with
+ * r14 and r15 containing the fault address and error code, with the
+ * original values stashed away in the PACA
+ */
+storage_fault_common:
+	std	r14,_DAR(r1)
+	std	r15,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	mr	r4,r14
+	mr	r5,r15
+	ld	r14,PACA_EXGEN+EX_R14(r13)
+	ld	r15,PACA_EXGEN+EX_R15(r13)
+	bl	do_page_fault
+	cmpdi	r3,0
+	bne-	1f
+	b	ret_from_except_lite
+1:	bl	save_nvgprs
+	mr	r5,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ld	r4,_DAR(r1)
+	bl	bad_page_fault
+	b	ret_from_except
+
+/*
+ * Alignment exception doesn't fit entirely in the 0x100 bytes so it
+ * continues here.
+ */
+alignment_more:
+	std	r14,_DAR(r1)
+	std	r15,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ld	r14,PACA_EXGEN+EX_R14(r13)
+	ld	r15,PACA_EXGEN+EX_R15(r13)
+	bl	save_nvgprs
+	INTS_RESTORE_HARD
+	bl	alignment_exception
+	b	ret_from_except
+
+/*
+ * We branch here from entry_64.S for the last stage of the exception
+ * return code path. MSR:EE is expected to be off at that point
+ */
+_GLOBAL(exception_return_book3e)
+	b	1f
+
+/* This is the return from load_up_fpu fast path which could do with
+ * less GPR restores in fact, but for now we have a single return path
+ */
+	.globl fast_exception_return
+fast_exception_return:
+	wrteei	0
+1:	mr	r0,r13
+	ld	r10,_MSR(r1)
+	REST_4GPRS(2, r1)
+	andi.	r6,r10,MSR_PR
+	REST_2GPRS(6, r1)
+	beq	1f
+	ACCOUNT_CPU_USER_EXIT(r13, r10, r11)
+	ld	r0,GPR13(r1)
+
+1:	stdcx.	r0,0,r1		/* to clear the reservation */
+
+	ld	r8,_CCR(r1)
+	ld	r9,_LINK(r1)
+	ld	r10,_CTR(r1)
+	ld	r11,_XER(r1)
+	mtcr	r8
+	mtlr	r9
+	mtctr	r10
+	mtxer	r11
+	REST_2GPRS(8, r1)
+	ld	r10,GPR10(r1)
+	ld	r11,GPR11(r1)
+	ld	r12,GPR12(r1)
+	mtspr	SPRN_SPRG_GEN_SCRATCH,r0
+
+	std	r10,PACA_EXGEN+EX_R10(r13);
+	std	r11,PACA_EXGEN+EX_R11(r13);
+	ld	r10,_NIP(r1)
+	ld	r11,_MSR(r1)
+	ld	r0,GPR0(r1)
+	ld	r1,GPR1(r1)
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	mfspr	r13,SPRN_SPRG_GEN_SCRATCH
+	rfi
+
+/*
+ * Trampolines used when spotting a bad kernel stack pointer in
+ * the exception entry code.
+ *
+ * TODO: move some bits like SRR0 read to trampoline, pass PACA
+ * index around, etc... to handle crit & mcheck
+ */
+BAD_STACK_TRAMPOLINE(0x000)
+BAD_STACK_TRAMPOLINE(0x100)
+BAD_STACK_TRAMPOLINE(0x200)
+BAD_STACK_TRAMPOLINE(0x220)
+BAD_STACK_TRAMPOLINE(0x260)
+BAD_STACK_TRAMPOLINE(0x280)
+BAD_STACK_TRAMPOLINE(0x2a0)
+BAD_STACK_TRAMPOLINE(0x2c0)
+BAD_STACK_TRAMPOLINE(0x2e0)
+BAD_STACK_TRAMPOLINE(0x300)
+BAD_STACK_TRAMPOLINE(0x310)
+BAD_STACK_TRAMPOLINE(0x320)
+BAD_STACK_TRAMPOLINE(0x340)
+BAD_STACK_TRAMPOLINE(0x400)
+BAD_STACK_TRAMPOLINE(0x500)
+BAD_STACK_TRAMPOLINE(0x600)
+BAD_STACK_TRAMPOLINE(0x700)
+BAD_STACK_TRAMPOLINE(0x800)
+BAD_STACK_TRAMPOLINE(0x900)
+BAD_STACK_TRAMPOLINE(0x980)
+BAD_STACK_TRAMPOLINE(0x9f0)
+BAD_STACK_TRAMPOLINE(0xa00)
+BAD_STACK_TRAMPOLINE(0xb00)
+BAD_STACK_TRAMPOLINE(0xc00)
+BAD_STACK_TRAMPOLINE(0xd00)
+BAD_STACK_TRAMPOLINE(0xd08)
+BAD_STACK_TRAMPOLINE(0xe00)
+BAD_STACK_TRAMPOLINE(0xf00)
+BAD_STACK_TRAMPOLINE(0xf20)
+
+	.globl	bad_stack_book3e
+bad_stack_book3e:
+	/* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */
+	mfspr	r10,SPRN_SRR0;		  /* read SRR0 before touching stack */
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,64+INT_FRAME_SIZE
+	std	r10,_NIP(r1)
+	std	r11,_MSR(r1)
+	ld	r10,PACA_EXGEN+EX_R1(r13) /* FIXME for crit & mcheck */
+	lwz	r11,PACA_EXGEN+EX_CR(r13) /* FIXME for crit & mcheck */
+	std	r10,GPR1(r1)
+	std	r11,_CCR(r1)
+	mfspr	r10,SPRN_DEAR
+	mfspr	r11,SPRN_ESR
+	std	r10,_DAR(r1)
+	std	r11,_DSISR(r1)
+	std	r0,GPR0(r1);		/* save r0 in stackframe */	    \
+	std	r2,GPR2(r1);		/* save r2 in stackframe */	    \
+	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe */    \
+	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe */	    \
+	std	r9,GPR9(r1);		/* save r9 in stackframe */	    \
+	ld	r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */		    \
+	ld	r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */		    \
+	mfspr	r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \
+	std	r3,GPR10(r1);		/* save r10 to stackframe */	    \
+	std	r4,GPR11(r1);		/* save r11 to stackframe */	    \
+	std	r12,GPR12(r1);		/* save r12 in stackframe */	    \
+	std	r5,GPR13(r1);		/* save it to stackframe */	    \
+	mflr	r10
+	mfctr	r11
+	mfxer	r12
+	std	r10,_LINK(r1)
+	std	r11,_CTR(r1)
+	std	r12,_XER(r1)
+	SAVE_10GPRS(14,r1)
+	SAVE_8GPRS(24,r1)
+	lhz	r12,PACA_TRAP_SAVE(r13)
+	std	r12,_TRAP(r1)
+	addi	r11,r1,INT_FRAME_SIZE
+	std	r11,0(r1)
+	li	r12,0
+	std	r12,0(r11)
+	ld	r2,PACATOC(r13)
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	kernel_bad_stack
+	b	1b
+
+/*
+ * Setup the initial TLB for a core. This current implementation
+ * assume that whatever we are running off will not conflict with
+ * the new mapping at PAGE_OFFSET.
+ */
+_GLOBAL(initial_tlb_book3e)
+
+	/* Look for the first TLB with IPROT set */
+	mfspr	r4,SPRN_TLB0CFG
+	andi.	r3,r4,TLBnCFG_IPROT
+	lis	r3,MAS0_TLBSEL(0)@h
+	bne	found_iprot
+
+	mfspr	r4,SPRN_TLB1CFG
+	andi.	r3,r4,TLBnCFG_IPROT
+	lis	r3,MAS0_TLBSEL(1)@h
+	bne	found_iprot
+
+	mfspr	r4,SPRN_TLB2CFG
+	andi.	r3,r4,TLBnCFG_IPROT
+	lis	r3,MAS0_TLBSEL(2)@h
+	bne	found_iprot
+
+	lis	r3,MAS0_TLBSEL(3)@h
+	mfspr	r4,SPRN_TLB3CFG
+	/* fall through */
+
+found_iprot:
+	andi.	r5,r4,TLBnCFG_HES
+	bne	have_hes
+
+	mflr	r8				/* save LR */
+/* 1. Find the index of the entry we're executing in
+ *
+ * r3 = MAS0_TLBSEL (for the iprot array)
+ * r4 = SPRN_TLBnCFG
+ */
+	bl	invstr				/* Find our address */
+invstr:	mflr	r6				/* Make it accessible */
+	mfmsr	r7
+	rlwinm	r5,r7,27,31,31			/* extract MSR[IS] */
+	mfspr	r7,SPRN_PID
+	slwi	r7,r7,16
+	or	r7,r7,r5
+	mtspr	SPRN_MAS6,r7
+	tlbsx	0,r6				/* search MSR[IS], SPID=PID */
+
+	mfspr	r3,SPRN_MAS0
+	rlwinm	r5,r3,16,20,31			/* Extract MAS0(Entry) */
+
+	mfspr	r7,SPRN_MAS1			/* Insure IPROT set */
+	oris	r7,r7,MAS1_IPROT@h
+	mtspr	SPRN_MAS1,r7
+	tlbwe
+
+/* 2. Invalidate all entries except the entry we're executing in
+ *
+ * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in
+ * r4 = SPRN_TLBnCFG
+ * r5 = ESEL of entry we are running in
+ */
+	andi.	r4,r4,TLBnCFG_N_ENTRY		/* Extract # entries */
+	li	r6,0				/* Set Entry counter to 0 */
+1:	mr	r7,r3				/* Set MAS0(TLBSEL) */
+	rlwimi	r7,r6,16,4,15			/* Setup MAS0 = TLBSEL | ESEL(r6) */
+	mtspr	SPRN_MAS0,r7
+	tlbre
+	mfspr	r7,SPRN_MAS1
+	rlwinm	r7,r7,0,2,31			/* Clear MAS1 Valid and IPROT */
+	cmpw	r5,r6
+	beq	skpinv				/* Dont update the current execution TLB */
+	mtspr	SPRN_MAS1,r7
+	tlbwe
+	isync
+skpinv:	addi	r6,r6,1				/* Increment */
+	cmpw	r6,r4				/* Are we done? */
+	bne	1b				/* If not, repeat */
+
+	/* Invalidate all TLBs */
+	PPC_TLBILX_ALL(0,R0)
+	sync
+	isync
+
+/* 3. Setup a temp mapping and jump to it
+ *
+ * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in
+ * r5 = ESEL of entry we are running in
+ */
+	andi.	r7,r5,0x1	/* Find an entry not used and is non-zero */
+	addi	r7,r7,0x1
+	mr	r4,r3		/* Set MAS0(TLBSEL) = 1 */
+	mtspr	SPRN_MAS0,r4
+	tlbre
+
+	rlwimi	r4,r7,16,4,15	/* Setup MAS0 = TLBSEL | ESEL(r7) */
+	mtspr	SPRN_MAS0,r4
+
+	mfspr	r7,SPRN_MAS1
+	xori	r6,r7,MAS1_TS		/* Setup TMP mapping in the other Address space */
+	mtspr	SPRN_MAS1,r6
+
+	tlbwe
+
+	mfmsr	r6
+	xori	r6,r6,MSR_IS
+	mtspr	SPRN_SRR1,r6
+	bl	1f		/* Find our address */
+1:	mflr	r6
+	addi	r6,r6,(2f - 1b)
+	mtspr	SPRN_SRR0,r6
+	rfi
+2:
+
+/* 4. Clear out PIDs & Search info
+ *
+ * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in
+ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
+ * r5 = MAS3
+ */
+	li	r6,0
+	mtspr   SPRN_MAS6,r6
+	mtspr	SPRN_PID,r6
+
+/* 5. Invalidate mapping we started in
+ *
+ * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in
+ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
+ * r5 = MAS3
+ */
+	mtspr	SPRN_MAS0,r3
+	tlbre
+	mfspr	r6,SPRN_MAS1
+	rlwinm	r6,r6,0,2,31	/* clear IPROT and VALID */
+	mtspr	SPRN_MAS1,r6
+	tlbwe
+	sync
+	isync
+
+/*
+ * The mapping only needs to be cache-coherent on SMP, except on
+ * Freescale e500mc derivatives where it's also needed for coherent DMA.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC)
+#define M_IF_NEEDED	MAS2_M
+#else
+#define M_IF_NEEDED	0
+#endif
+
+/* 6. Setup KERNELBASE mapping in TLB[0]
+ *
+ * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in
+ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
+ * r5 = MAS3
+ */
+	rlwinm	r3,r3,0,16,3	/* clear ESEL */
+	mtspr	SPRN_MAS0,r3
+	lis	r6,(MAS1_VALID|MAS1_IPROT)@h
+	ori	r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l
+	mtspr	SPRN_MAS1,r6
+
+	LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | M_IF_NEEDED)
+	mtspr	SPRN_MAS2,r6
+
+	rlwinm	r5,r5,0,0,25
+	ori	r5,r5,MAS3_SR | MAS3_SW | MAS3_SX
+	mtspr	SPRN_MAS3,r5
+	li	r5,-1
+	rlwinm	r5,r5,0,0,25
+
+	tlbwe
+
+/* 7. Jump to KERNELBASE mapping
+ *
+ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
+ */
+	/* Now we branch the new virtual address mapped by this entry */
+	bl	1f		/* Find our address */
+1:	mflr	r6
+	addi	r6,r6,(2f - 1b)
+	tovirt(r6,r6)
+	lis	r7,MSR_KERNEL@h
+	ori	r7,r7,MSR_KERNEL@l
+	mtspr	SPRN_SRR0,r6
+	mtspr	SPRN_SRR1,r7
+	rfi				/* start execution out of TLB1[0] entry */
+2:
+
+/* 8. Clear out the temp mapping
+ *
+ * r4 = MAS0 w/TLBSEL & ESEL for the entry we are running in
+ */
+	mtspr	SPRN_MAS0,r4
+	tlbre
+	mfspr	r5,SPRN_MAS1
+	rlwinm	r5,r5,0,2,31	/* clear IPROT and VALID */
+	mtspr	SPRN_MAS1,r5
+	tlbwe
+	sync
+	isync
+
+	/* We translate LR and return */
+	tovirt(r8,r8)
+	mtlr	r8
+	blr
+
+have_hes:
+	/* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the
+	 * kernel linear mapping. We also set MAS8 once for all here though
+	 * that will have to be made dependent on whether we are running under
+	 * a hypervisor I suppose.
+	 */
+
+	/* BEWARE, MAGIC
+	 * This code is called as an ordinary function on the boot CPU. But to
+	 * avoid duplication, this code is also used in SCOM bringup of
+	 * secondary CPUs. We read the code between the initial_tlb_code_start
+	 * and initial_tlb_code_end labels one instruction at a time and RAM it
+	 * into the new core via SCOM. That doesn't process branches, so there
+	 * must be none between those two labels. It also means if this code
+	 * ever takes any parameters, the SCOM code must also be updated to
+	 * provide them.
+	 */
+	.globl a2_tlbinit_code_start
+a2_tlbinit_code_start:
+
+	ori	r11,r3,MAS0_WQ_ALLWAYS
+	oris	r11,r11,MAS0_ESEL(3)@h /* Use way 3: workaround A2 erratum 376 */
+	mtspr	SPRN_MAS0,r11
+	lis	r3,(MAS1_VALID | MAS1_IPROT)@h
+	ori	r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT
+	mtspr	SPRN_MAS1,r3
+	LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET | MAS2_M)
+	mtspr	SPRN_MAS2,r3
+	li	r3,MAS3_SR | MAS3_SW | MAS3_SX
+	mtspr	SPRN_MAS7_MAS3,r3
+	li	r3,0
+	mtspr	SPRN_MAS8,r3
+
+	/* Write the TLB entry */
+	tlbwe
+
+	.globl a2_tlbinit_after_linear_map
+a2_tlbinit_after_linear_map:
+
+	/* Now we branch the new virtual address mapped by this entry */
+	LOAD_REG_IMMEDIATE(r3,1f)
+	mtctr	r3
+	bctr
+
+1:	/* We are now running at PAGE_OFFSET, clean the TLB of everything
+	 * else (including IPROTed things left by firmware)
+	 * r4 = TLBnCFG
+	 * r3 = current address (more or less)
+	 */
+
+	li	r5,0
+	mtspr	SPRN_MAS6,r5
+	tlbsx	0,r3
+
+	rlwinm	r9,r4,0,TLBnCFG_N_ENTRY
+	rlwinm	r10,r4,8,0xff
+	addi	r10,r10,-1	/* Get inner loop mask */
+
+	li	r3,1
+
+	mfspr	r5,SPRN_MAS1
+	rlwinm	r5,r5,0,(~(MAS1_VALID|MAS1_IPROT))
+
+	mfspr	r6,SPRN_MAS2
+	rldicr	r6,r6,0,51		/* Extract EPN */
+
+	mfspr	r7,SPRN_MAS0
+	rlwinm	r7,r7,0,0xffff0fff	/* Clear HES and WQ */
+
+	rlwinm	r8,r7,16,0xfff		/* Extract ESEL */
+
+2:	add	r4,r3,r8
+	and	r4,r4,r10
+
+	rlwimi	r7,r4,16,MAS0_ESEL_MASK
+
+	mtspr	SPRN_MAS0,r7
+	mtspr	SPRN_MAS1,r5
+	mtspr	SPRN_MAS2,r6
+	tlbwe
+
+	addi	r3,r3,1
+	and.	r4,r3,r10
+
+	bne	3f
+	addis	r6,r6,(1<<30)@h
+3:
+	cmpw	r3,r9
+	blt	2b
+
+	.globl  a2_tlbinit_after_iprot_flush
+a2_tlbinit_after_iprot_flush:
+
+	PPC_TLBILX(0,0,R0)
+	sync
+	isync
+
+	.globl a2_tlbinit_code_end
+a2_tlbinit_code_end:
+
+	/* We translate LR and return */
+	mflr	r3
+	tovirt(r3,r3)
+	mtlr	r3
+	blr
+
+/*
+ * Main entry (boot CPU, thread 0)
+ *
+ * We enter here from head_64.S, possibly after the prom_init trampoline
+ * with r3 and r4 already saved to r31 and 30 respectively and in 64 bits
+ * mode. Anything else is as it was left by the bootloader
+ *
+ * Initial requirements of this port:
+ *
+ * - Kernel loaded at 0 physical
+ * - A good lump of memory mapped 0:0 by UTLB entry 0
+ * - MSR:IS & MSR:DS set to 0
+ *
+ * Note that some of the above requirements will be relaxed in the future
+ * as the kernel becomes smarter at dealing with different initial conditions
+ * but for now you have to be careful
+ */
+_GLOBAL(start_initialization_book3e)
+	mflr	r28
+
+	/* First, we need to setup some initial TLBs to map the kernel
+	 * text, data and bss at PAGE_OFFSET. We don't have a real mode
+	 * and always use AS 0, so we just set it up to match our link
+	 * address and never use 0 based addresses.
+	 */
+	bl	initial_tlb_book3e
+
+	/* Init global core bits */
+	bl	init_core_book3e
+
+	/* Init per-thread bits */
+	bl	init_thread_book3e
+
+	/* Return to common init code */
+	tovirt(r28,r28)
+	mtlr	r28
+	blr
+
+
+/*
+ * Secondary core/processor entry
+ *
+ * This is entered for thread 0 of a secondary core, all other threads
+ * are expected to be stopped. It's similar to start_initialization_book3e
+ * except that it's generally entered from the holding loop in head_64.S
+ * after CPUs have been gathered by Open Firmware.
+ *
+ * We assume we are in 32 bits mode running with whatever TLB entry was
+ * set for us by the firmware or POR engine.
+ */
+_GLOBAL(book3e_secondary_core_init_tlb_set)
+	li	r4,1
+	b	generic_secondary_smp_init
+
+_GLOBAL(book3e_secondary_core_init)
+	mflr	r28
+
+	/* Do we need to setup initial TLB entry ? */
+	cmplwi	r4,0
+	bne	2f
+
+	/* Setup TLB for this core */
+	bl	initial_tlb_book3e
+
+	/* We can return from the above running at a different
+	 * address, so recalculate r2 (TOC)
+	 */
+	bl	relative_toc
+
+	/* Init global core bits */
+2:	bl	init_core_book3e
+
+	/* Init per-thread bits */
+3:	bl	init_thread_book3e
+
+	/* Return to common init code at proper virtual address.
+	 *
+	 * Due to various previous assumptions, we know we entered this
+	 * function at either the final PAGE_OFFSET mapping or using a
+	 * 1:1 mapping at 0, so we don't bother doing a complicated check
+	 * here, we just ensure the return address has the right top bits.
+	 *
+	 * Note that if we ever want to be smarter about where we can be
+	 * started from, we have to be careful that by the time we reach
+	 * the code below we may already be running at a different location
+	 * than the one we were called from since initial_tlb_book3e can
+	 * have moved us already.
+	 */
+	cmpdi	cr0,r28,0
+	blt	1f
+	lis	r3,PAGE_OFFSET@highest
+	sldi	r3,r3,32
+	or	r28,r28,r3
+1:	mtlr	r28
+	blr
+
+_GLOBAL(book3e_secondary_thread_init)
+	mflr	r28
+	b	3b
+
+	.globl init_core_book3e
+init_core_book3e:
+	/* Establish the interrupt vector base */
+	tovirt(r2,r2)
+	LOAD_REG_ADDR(r3, interrupt_base_book3e)
+	mtspr	SPRN_IVPR,r3
+	sync
+	blr
+
+init_thread_book3e:
+	lis	r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h
+	mtspr	SPRN_EPCR,r3
+
+	/* Make sure interrupts are off */
+	wrteei	0
+
+	/* disable all timers and clear out status */
+	li	r3,0
+	mtspr	SPRN_TCR,r3
+	mfspr	r3,SPRN_TSR
+	mtspr	SPRN_TSR,r3
+
+	blr
+
+_GLOBAL(__setup_base_ivors)
+	SET_IVOR(0, 0x020) /* Critical Input */
+	SET_IVOR(1, 0x000) /* Machine Check */
+	SET_IVOR(2, 0x060) /* Data Storage */ 
+	SET_IVOR(3, 0x080) /* Instruction Storage */
+	SET_IVOR(4, 0x0a0) /* External Input */ 
+	SET_IVOR(5, 0x0c0) /* Alignment */ 
+	SET_IVOR(6, 0x0e0) /* Program */ 
+	SET_IVOR(7, 0x100) /* FP Unavailable */ 
+	SET_IVOR(8, 0x120) /* System Call */ 
+	SET_IVOR(9, 0x140) /* Auxiliary Processor Unavailable */ 
+	SET_IVOR(10, 0x160) /* Decrementer */ 
+	SET_IVOR(11, 0x180) /* Fixed Interval Timer */ 
+	SET_IVOR(12, 0x1a0) /* Watchdog Timer */ 
+	SET_IVOR(13, 0x1c0) /* Data TLB Error */ 
+	SET_IVOR(14, 0x1e0) /* Instruction TLB Error */
+	SET_IVOR(15, 0x040) /* Debug */
+
+	sync
+
+	blr
+
+_GLOBAL(setup_altivec_ivors)
+	SET_IVOR(32, 0x200) /* AltiVec Unavailable */
+	SET_IVOR(33, 0x220) /* AltiVec Assist */
+	blr
+
+_GLOBAL(setup_perfmon_ivor)
+	SET_IVOR(35, 0x260) /* Performance Monitor */
+	blr
+
+_GLOBAL(setup_doorbell_ivors)
+	SET_IVOR(36, 0x280) /* Processor Doorbell */
+	SET_IVOR(37, 0x2a0) /* Processor Doorbell Crit */
+	blr
+
+_GLOBAL(setup_ehv_ivors)
+	SET_IVOR(40, 0x300) /* Embedded Hypervisor System Call */
+	SET_IVOR(41, 0x320) /* Embedded Hypervisor Privilege */
+	SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
+	SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
+	blr
+
+_GLOBAL(setup_lrat_ivor)
+	SET_IVOR(42, 0x340) /* LRAT Error */
+	blr
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
new file mode 100644
index 000000000..344e2758b
--- /dev/null
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -0,0 +1,1936 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file contains the 64-bit "server" PowerPC variant
+ * of the low level exception handling including exception
+ * vectors, exception return, part of the slb and stab
+ * handling and other fixed offset specific things.
+ *
+ * This file is meant to be #included from head_64.S due to
+ * position dependent assembly.
+ *
+ * Most of this originates from head_64.S and thus has the same
+ * copyright history.
+ *
+ */
+
+#include <asm/hw_irq.h>
+#include <asm/exception-64s.h>
+#include <asm/ptrace.h>
+#include <asm/cpuidle.h>
+#include <asm/head-64.h>
+#include <asm/feature-fixups.h>
+
+/*
+ * There are a few constraints to be concerned with.
+ * - Real mode exceptions code/data must be located at their physical location.
+ * - Virtual mode exceptions must be mapped at their 0xc000... location.
+ * - Fixed location code must not call directly beyond the __end_interrupts
+ *   area when built with CONFIG_RELOCATABLE. LOAD_HANDLER / bctr sequence
+ *   must be used.
+ * - LOAD_HANDLER targets must be within first 64K of physical 0 /
+ *   virtual 0xc00...
+ * - Conditional branch targets must be within +/-32K of caller.
+ *
+ * "Virtual exceptions" run with relocation on (MSR_IR=1, MSR_DR=1), and
+ * therefore don't have to run in physically located code or rfid to
+ * virtual mode kernel code. However on relocatable kernels they do have
+ * to branch to KERNELBASE offset because the rest of the kernel (outside
+ * the exception vectors) may be located elsewhere.
+ *
+ * Virtual exceptions correspond with physical, except their entry points
+ * are offset by 0xc000000000000000 and also tend to get an added 0x4000
+ * offset applied. Virtual exceptions are enabled with the Alternate
+ * Interrupt Location (AIL) bit set in the LPCR. However this does not
+ * guarantee they will be delivered virtually. Some conditions (see the ISA)
+ * cause exceptions to be delivered in real mode.
+ *
+ * It's impossible to receive interrupts below 0x300 via AIL.
+ *
+ * KVM: None of the virtual exceptions are from the guest. Anything that
+ * escalated to HV=1 from HV=0 is delivered via real mode handlers.
+ *
+ *
+ * We layout physical memory as follows:
+ * 0x0000 - 0x00ff : Secondary processor spin code
+ * 0x0100 - 0x18ff : Real mode pSeries interrupt vectors
+ * 0x1900 - 0x3fff : Real mode trampolines
+ * 0x4000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors
+ * 0x5900 - 0x6fff : Relon mode trampolines
+ * 0x7000 - 0x7fff : FWNMI data area
+ * 0x8000 -   .... : Common interrupt handlers, remaining early
+ *                   setup code, rest of kernel.
+ *
+ * We could reclaim 0x4000-0x42ff for real mode trampolines if the space
+ * is necessary. Until then it's more consistent to explicitly put VIRT_NONE
+ * vectors there.
+ */
+OPEN_FIXED_SECTION(real_vectors,        0x0100, 0x1900)
+OPEN_FIXED_SECTION(real_trampolines,    0x1900, 0x4000)
+OPEN_FIXED_SECTION(virt_vectors,        0x4000, 0x5900)
+OPEN_FIXED_SECTION(virt_trampolines,    0x5900, 0x7000)
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * Data area reserved for FWNMI option.
+ * This address (0x7000) is fixed by the RPA.
+ * pseries and powernv need to keep the whole page from
+ * 0x7000 to 0x8000 free for use by the firmware
+ */
+ZERO_FIXED_SECTION(fwnmi_page,          0x7000, 0x8000)
+OPEN_TEXT_SECTION(0x8000)
+#else
+OPEN_TEXT_SECTION(0x7000)
+#endif
+
+USE_FIXED_SECTION(real_vectors)
+
+/*
+ * This is the start of the interrupt handlers for pSeries
+ * This code runs with relocation off.
+ * Code from here to __end_interrupts gets copied down to real
+ * address 0x100 when we are running a relocatable kernel.
+ * Therefore any relative branches in this section must only
+ * branch to labels in this section.
+ */
+	.globl __start_interrupts
+__start_interrupts:
+
+/* No virt vectors corresponding with 0x0..0x100 */
+EXC_VIRT_NONE(0x4000, 0x100)
+
+
+#ifdef CONFIG_PPC_P7_NAP
+	/*
+	 * If running native on arch 2.06 or later, check if we are waking up
+	 * from nap/sleep/winkle, and branch to idle handler. This tests SRR1
+	 * bits 46:47. A non-0 value indicates that we are coming from a power
+	 * saving state. The idle wakeup handler initially runs in real mode,
+	 * but we branch to the 0xc000... address so we can turn on relocation
+	 * with mtmsr.
+	 */
+#define IDLETEST(n)							\
+	BEGIN_FTR_SECTION ;						\
+	mfspr	r10,SPRN_SRR1 ;						\
+	rlwinm.	r10,r10,47-31,30,31 ;					\
+	beq-	1f ;							\
+	cmpwi	cr3,r10,2 ;						\
+	BRANCH_TO_C000(r10, system_reset_idle_common) ;			\
+1:									\
+	KVMTEST_PR(n) ;							\
+	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#else
+#define IDLETEST NOTEST
+#endif
+
+EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
+	SET_SCRATCH0(r13)
+	/*
+	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
+	 * being used, so a nested NMI exception would corrupt it.
+	 */
+	EXCEPTION_PROLOG_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
+			      IDLETEST, 0x100)
+
+EXC_REAL_END(system_reset, 0x100, 0x100)
+EXC_VIRT_NONE(0x4100, 0x100)
+TRAMP_KVM(PACA_EXNMI, 0x100)
+
+#ifdef CONFIG_PPC_P7_NAP
+EXC_COMMON_BEGIN(system_reset_idle_common)
+	mfspr	r12,SPRN_SRR1
+	b	pnv_powersave_wakeup
+#endif
+
+/*
+ * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does
+ * the right thing. We do not want to reconcile because that goes
+ * through irq tracing which we don't want in NMI.
+ *
+ * Save PACAIRQHAPPENED because some code will do a hard disable
+ * (e.g., xmon). So we want to restore this back to where it was
+ * when we return. DAR is unused in the stack, so save it there.
+ */
+#define ADD_RECONCILE_NMI						\
+	li	r10,IRQS_ALL_DISABLED;					\
+	stb	r10,PACAIRQSOFTMASK(r13);				\
+	lbz	r10,PACAIRQHAPPENED(r13);				\
+	std	r10,_DAR(r1)
+
+EXC_COMMON_BEGIN(system_reset_common)
+	/*
+	 * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
+	 * to recover, but nested NMI will notice in_nmi and not recover
+	 * because of the use of the NMI stack. in_nmi reentrancy is tested in
+	 * system_reset_exception.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	addi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+
+	mr	r10,r1
+	ld	r1,PACA_NMI_EMERG_SP(r13)
+	subi	r1,r1,INT_FRAME_SIZE
+	EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
+			system_reset, system_reset_exception,
+			ADD_NVGPRS;ADD_RECONCILE_NMI)
+
+	/* This (and MCE) can be simplified with mtmsrd L=1 */
+	/* Clear MSR_RI before setting SRR0 and SRR1. */
+	li	r0,MSR_RI
+	mfmsr	r9
+	andc	r9,r9,r0
+	mtmsrd	r9,1
+
+	/*
+	 * MSR_RI is clear, now we can decrement paca->in_nmi.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	subi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+
+	/*
+	 * Restore soft mask settings.
+	 */
+	ld	r10,_DAR(r1)
+	stb	r10,PACAIRQHAPPENED(r13)
+	ld	r10,SOFTE(r1)
+	stb	r10,PACAIRQSOFTMASK(r13)
+
+	/*
+	 * Keep below code in synch with MACHINE_CHECK_HANDLER_WINDUP.
+	 * Should share common bits...
+	 */
+
+	/* Move original SRR0 and SRR1 into the respective regs */
+	ld	r9,_MSR(r1)
+	mtspr	SPRN_SRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_SRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	mtcr	r11
+	REST_GPR(11, r1)
+	REST_2GPRS(12, r1)
+	/* restore original r1. */
+	ld	r1,GPR1(r1)
+	RFI_TO_USER_OR_KERNEL
+
+#ifdef CONFIG_PPC_PSERIES
+/*
+ * Vectors for the FWNMI option.  Share common code.
+ */
+TRAMP_REAL_BEGIN(system_reset_fwnmi)
+	SET_SCRATCH0(r13)		/* save r13 */
+	/* See comment at system_reset exception */
+	EXCEPTION_PROLOG_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
+			      NOTEST, 0x100)
+#endif /* CONFIG_PPC_PSERIES */
+
+
+EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
+	/* This is moved out of line as it can be patched by FW, but
+	 * some code path might still want to branch into the original
+	 * vector
+	 */
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+	b	machine_check_powernv_early
+FTR_SECTION_ELSE
+	b	machine_check_pSeries_0
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+EXC_REAL_END(machine_check, 0x200, 0x100)
+EXC_VIRT_NONE(0x4200, 0x100)
+TRAMP_REAL_BEGIN(machine_check_powernv_early)
+BEGIN_FTR_SECTION
+	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+	/*
+	 * Register contents:
+	 * R13		= PACA
+	 * R9		= CR
+	 * Original R9 to R13 is saved on PACA_EXMC
+	 *
+	 * Switch to mc_emergency stack and handle re-entrancy (we limit
+	 * the nested MCE upto level 4 to avoid stack overflow).
+	 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
+	 *
+	 * We use paca->in_mce to check whether this is the first entry or
+	 * nested machine check. We increment paca->in_mce to track nested
+	 * machine checks.
+	 *
+	 * If this is the first entry then set stack pointer to
+	 * paca->mc_emergency_sp, otherwise r1 is already pointing to
+	 * stack frame on mc_emergency stack.
+	 *
+	 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
+	 * checkstop if we get another machine check exception before we do
+	 * rfid with MSR_ME=1.
+	 *
+	 * This interrupt can wake directly from idle. If that is the case,
+	 * the machine check is handled then the idle wakeup code is called
+	 * to restore state.
+	 */
+	mr	r11,r1			/* Save r1 */
+	lhz	r10,PACA_IN_MCE(r13)
+	cmpwi	r10,0			/* Are we in nested machine check */
+	bne	0f			/* Yes, we are. */
+	/* First machine check entry */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
+	addi	r10,r10,1		/* increment paca->in_mce */
+	sth	r10,PACA_IN_MCE(r13)
+	/* Limit nested MCE to level 4 to avoid stack overflow */
+	cmpwi	r10,MAX_MCE_DEPTH
+	bgt	2f			/* Check if we hit limit of 4 */
+	std	r11,GPR1(r1)		/* Save r1 on the stack. */
+	std	r11,0(r1)		/* make stack chain pointer */
+	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
+	std	r11,_NIP(r1)
+	mfspr	r11,SPRN_SRR1		/* Save SRR1 */
+	std	r11,_MSR(r1)
+	mfspr	r11,SPRN_DAR		/* Save DAR */
+	std	r11,_DAR(r1)
+	mfspr	r11,SPRN_DSISR		/* Save DSISR */
+	std	r11,_DSISR(r1)
+	std	r9,_CCR(r1)		/* Save CR in stackframe */
+	/* Save r9 through r13 from EXMC save area to stack frame. */
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	mfmsr	r11			/* get MSR value */
+	ori	r11,r11,MSR_ME		/* turn on ME bit */
+	ori	r11,r11,MSR_RI		/* turn on RI bit */
+	LOAD_HANDLER(r12, machine_check_handle_early)
+1:	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r11
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+2:
+	/* Stack overflow. Stay on emergency stack and panic.
+	 * Keep the ME bit off while panic-ing, so that if we hit
+	 * another machine check we checkstop.
+	 */
+	addi	r1,r1,INT_FRAME_SIZE	/* go back to previous stack frame */
+	ld	r11,PACAKMSR(r13)
+	LOAD_HANDLER(r12, unrecover_mce)
+	li	r10,MSR_ME
+	andc	r11,r11,r10		/* Turn off MSR_ME */
+	b	1b
+	b	.	/* prevent speculative execution */
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+
+TRAMP_REAL_BEGIN(machine_check_pSeries)
+	.globl machine_check_fwnmi
+machine_check_fwnmi:
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_0(PACA_EXMC)
+machine_check_pSeries_0:
+	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
+	/*
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
+	 */
+	EXCEPTION_PROLOG_2_NORI(machine_check_common, EXC_STD)
+
+TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
+
+EXC_COMMON_BEGIN(machine_check_common)
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXMC+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXMC+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
+	FINISH_NAP
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	/* Enable MSR_RI when finished with PACA_EXMC */
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_except
+
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r0,MSR_RI;				\
+	mfmsr	r9;		/* get MSR value */	\
+	andc	r9,r9,r0;				\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Move original SRR0 and SRR1 into the respective regs */	\
+	ld	r9,_MSR(r1);				\
+	mtspr	SPRN_SRR1,r9;				\
+	ld	r3,_NIP(r1);				\
+	mtspr	SPRN_SRR0,r3;				\
+	ld	r9,_CTR(r1);				\
+	mtctr	r9;					\
+	ld	r9,_XER(r1);				\
+	mtxer	r9;					\
+	ld	r9,_LINK(r1);				\
+	mtlr	r9;					\
+	REST_GPR(0, r1);				\
+	REST_8GPRS(2, r1);				\
+	REST_GPR(10, r1);				\
+	ld	r11,_CCR(r1);				\
+	mtcr	r11;					\
+	/* Decrement paca->in_mce. */			\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	REST_GPR(11, r1);				\
+	REST_2GPRS(12, r1);				\
+	/* restore original r1. */			\
+	ld	r1,GPR1(r1)
+
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
+	/* Recoverability could be improved by reducing the use of SRR1. */
+	li	r11,0
+	mtmsrd	r11,1
+
+	b	pnv_powersave_wakeup_mce
+#endif
+	/*
+	 * Handle machine check early in real mode. We come here with
+	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
+	 */
+EXC_COMMON_BEGIN(machine_check_handle_early)
+	std	r0,GPR0(r1)	/* Save r0 */
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_early
+	std	r3,RESULT(r1)	/* Save result */
+	ld	r12,_MSR(r1)
+
+#ifdef	CONFIG_PPC_P7_NAP
+	/*
+	 * Check if thread was in power saving mode. We come here when any
+	 * of the following is true:
+	 * a. thread wasn't in power saving mode
+	 * b. thread was in power saving mode with no state loss,
+	 *    supervisor state loss or hypervisor state loss.
+	 *
+	 * Go back to nap/sleep/winkle mode again if (b) is true.
+	 */
+	BEGIN_FTR_SECTION
+	rlwinm.	r11,r12,47-31,30,31
+	bne	machine_check_idle_common
+	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#endif
+
+	/*
+	 * Check if we are coming from hypervisor userspace. If yes then we
+	 * continue in host kernel in V mode to deliver the MC event.
+	 */
+	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
+	beq	5f
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	9f			/* continue in V mode if we are. */
+
+5:
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	/*
+	 * We are coming from kernel context. Check if we are coming from
+	 * guest. if yes, then we can continue. We will fall through
+	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
+	 */
+	lbz	r11,HSTATE_IN_GUEST(r13)
+	cmpwi	r11,0			/* Check if coming from guest */
+	bne	9f			/* continue if we are. */
+#endif
+	/*
+	 * At this point we are not sure about what context we come from.
+	 * Queue up the MCE event and return from the interrupt.
+	 * But before that, check if this is an un-recoverable exception.
+	 * If yes, then stay on emergency stack and panic.
+	 */
+	andi.	r11,r12,MSR_RI
+	bne	2f
+1:	mfspr	r11,SPRN_SRR0
+	LOAD_HANDLER(r10,unrecover_mce)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, system will checkstop
+	 * and hypervisor will get restarted cleanly by SP.
+	 */
+	li	r3,MSR_ME
+	andc	r10,r10,r3		/* Turn off MSR_ME */
+	mtspr	SPRN_SRR1,r10
+	RFI_TO_KERNEL
+	b	.
+2:
+	/*
+	 * Check if we have successfully handled/recovered from error, if not
+	 * then stay on emergency stack and panic.
+	 */
+	ld	r3,RESULT(r1)	/* Load result */
+	cmpdi	r3,0		/* see if we handled MCE successfully */
+
+	beq	1b		/* if !handled then panic */
+	/*
+	 * Return from MC interrupt.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
+	 */
+	bl	machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	RFI_TO_USER_OR_KERNEL
+9:
+	/* Deliver the machine check to host kernel in V mode. */
+BEGIN_FTR_SECTION
+	ld	r10,ORIG_GPR3(r1)
+	mtspr	SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	MACHINE_CHECK_HANDLER_WINDUP
+	b	machine_check_pSeries
+
+EXC_COMMON_BEGIN(unrecover_mce)
+	/* Invoke machine_check_exception to print MCE event and panic. */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	/*
+	 * We will not reach here. Even if we did, there is no way out. Call
+	 * unrecoverable_exception and die.
+	 */
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	1b
+
+
+EXC_REAL_OOL(data_access, 0x300, 0x80)
+EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
+TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
+
+EXC_COMMON_BEGIN(data_access_common)
+	/*
+	 * Here r13 points to the paca, r9 contains the saved CR,
+	 * SRR0 and SRR1 are saved in r11 and r12,
+	 * r9 - r13 are saved in paca->exgen.
+	 */
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r12,_MSR(r1)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	li	r5,0x300
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
+	b	do_hash_page		/* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+	b	handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+
+
+EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	b tramp_data_access_slb
+EXC_REAL_END(data_access_slb, 0x380, 0x80)
+
+TRAMP_REAL_BEGIN(tramp_data_access_slb)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+	mr	r12,r3	/* save r3 */
+	mfspr	r3,SPRN_DAR
+	mfspr	r11,SPRN_SRR1
+	crset	4*cr6+eq
+	BRANCH_TO_COMMON(r10, slb_miss_common)
+
+EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
+	mr	r12,r3	/* save r3 */
+	mfspr	r3,SPRN_DAR
+	mfspr	r11,SPRN_SRR1
+	crset	4*cr6+eq
+	BRANCH_TO_COMMON(r10, slb_miss_common)
+EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
+TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
+
+
+EXC_REAL_OOL(instruction_access, 0x400, 0x80)
+EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400)
+TRAMP_KVM(PACA_EXGEN, 0x400)
+
+EXC_COMMON_BEGIN(instruction_access_common)
+	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r12,_MSR(r1)
+	ld	r3,_NIP(r1)
+	andis.	r4,r12,DSISR_SRR1_MATCH_64S@h
+	li	r5,0x400
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
+	b	do_hash_page		/* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+	b	handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+
+
+EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	b tramp_instruction_access_slb
+EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
+
+TRAMP_REAL_BEGIN(tramp_instruction_access_slb)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
+	mr	r12,r3	/* save r3 */
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r11,SPRN_SRR1
+	crclr	4*cr6+eq
+	BRANCH_TO_COMMON(r10, slb_miss_common)
+
+EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
+	mr	r12,r3	/* save r3 */
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r11,SPRN_SRR1
+	crclr	4*cr6+eq
+	BRANCH_TO_COMMON(r10, slb_miss_common)
+EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
+TRAMP_KVM(PACA_EXSLB, 0x480)
+
+
+/*
+ * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as
+ * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled.
+ */
+EXC_COMMON_BEGIN(slb_miss_common)
+	/*
+	 * r13 points to the PACA, r9 contains the saved CR,
+	 * r12 contains the saved r3,
+	 * r11 contain the saved SRR1, SRR0 is still ready for return
+	 * r3 has the faulting address
+	 * r9 - r13 are saved in paca->exslb.
+ 	 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss
+	 * We assume we aren't going to take any exceptions during this
+	 * procedure.
+	 */
+	mflr	r10
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+
+	andi.	r9,r11,MSR_PR	// Check for exception from userspace
+	cmpdi	cr4,r9,MSR_PR	// And save the result in CR4 for later
+
+	/*
+	 * Test MSR_RI before calling slb_allocate_realmode, because the
+	 * MSR in r11 gets clobbered. However we still want to allocate
+	 * SLB in case MSR_RI=0, to minimise the risk of getting stuck in
+	 * recursive SLB faults. So use cr5 for this, which is preserved.
+	 */
+	andi.	r11,r11,MSR_RI	/* check for unrecoverable exception */
+	cmpdi	cr5,r11,MSR_RI
+
+	crset	4*cr0+eq
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_MMU_FTR_SECTION
+	bl	slb_allocate
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
+#endif
+
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+	mtlr	r10
+
+	/*
+	 * Large address, check whether we have to allocate new contexts.
+	 */
+	beq-	8f
+
+	bne-	cr5,2f		/* if unrecoverable exception, oops */
+
+	/* All done -- return from exception. */
+
+	bne	cr4,1f		/* returning to kernel */
+
+	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
+	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
+	mtcrf	0x02,r9		/* I/D indication is in cr6 */
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+
+	RESTORE_CTR(r9, PACA_EXSLB)
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	mr	r3,r12
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+1:
+	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
+	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
+	mtcrf	0x02,r9		/* I/D indication is in cr6 */
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+
+	RESTORE_CTR(r9, PACA_EXSLB)
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	mr	r3,r12
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	RFI_TO_KERNEL
+	b	.	/* prevent speculative execution */
+
+
+2:	std     r3,PACA_EXSLB+EX_DAR(r13)
+	mr	r3,r12
+	mfspr	r11,SPRN_SRR0
+	mfspr	r12,SPRN_SRR1
+	LOAD_HANDLER(r10,unrecov_slb)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	RFI_TO_KERNEL
+	b	.
+
+8:	std     r3,PACA_EXSLB+EX_DAR(r13)
+	mr	r3,r12
+	mfspr	r11,SPRN_SRR0
+	mfspr	r12,SPRN_SRR1
+	LOAD_HANDLER(r10, large_addr_slb)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	RFI_TO_KERNEL
+	b	.
+
+EXC_COMMON_BEGIN(unrecov_slb)
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+	RECONCILE_IRQ_STATE(r10, r11)
+	bl	save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	1b
+
+EXC_COMMON_BEGIN(large_addr_slb)
+	EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB)
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r3, PACA_EXSLB+EX_DAR(r13)
+	std	r3, _DAR(r1)
+	beq	cr6, 2f
+	li	r10, 0x481		/* fix trap number for I-SLB miss */
+	std	r10, _TRAP(r1)
+2:	bl	save_nvgprs
+	addi	r3, r1, STACK_FRAME_OVERHEAD
+	bl	slb_miss_large_addr
+	b	ret_from_except
+
+EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
+	.globl hardware_interrupt_hv;
+hardware_interrupt_hv:
+	BEGIN_FTR_SECTION
+		MASKABLE_EXCEPTION_HV(0x500, hardware_interrupt_common, IRQS_DISABLED)
+	FTR_SECTION_ELSE
+		MASKABLE_EXCEPTION(0x500, hardware_interrupt_common, IRQS_DISABLED)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
+
+EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
+	.globl hardware_interrupt_relon_hv;
+hardware_interrupt_relon_hv:
+	BEGIN_FTR_SECTION
+		MASKABLE_RELON_EXCEPTION_HV(0x500, hardware_interrupt_common,
+					    IRQS_DISABLED)
+	FTR_SECTION_ELSE
+		__MASKABLE_RELON_EXCEPTION(0x500, hardware_interrupt_common,
+					   EXC_STD, SOFTEN_TEST_PR, IRQS_DISABLED)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
+
+TRAMP_KVM(PACA_EXGEN, 0x500)
+TRAMP_KVM_HV(PACA_EXGEN, 0x500)
+EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
+
+
+EXC_REAL(alignment, 0x600, 0x100)
+EXC_VIRT(alignment, 0x4600, 0x100, 0x600)
+TRAMP_KVM(PACA_EXGEN, 0x600)
+EXC_COMMON_BEGIN(alignment_common)
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	alignment_exception
+	b	ret_from_except
+
+
+EXC_REAL(program_check, 0x700, 0x100)
+EXC_VIRT(program_check, 0x4700, 0x100, 0x700)
+TRAMP_KVM(PACA_EXGEN, 0x700)
+EXC_COMMON_BEGIN(program_check_common)
+	/*
+	 * It's possible to receive a TM Bad Thing type program check with
+	 * userspace register values (in particular r1), but with SRR1 reporting
+	 * that we came from the kernel. Normally that would confuse the bad
+	 * stack logic, and we would report a bad kernel stack pointer. Instead
+	 * we switch to the emergency stack if we're taking a TM Bad Thing from
+	 * the kernel.
+	 */
+	li	r10,MSR_PR		/* Build a mask of MSR_PR ..	*/
+	oris	r10,r10,0x200000@h	/* .. and SRR1_PROGTM		*/
+	and	r10,r10,r12		/* Mask SRR1 with that.		*/
+	srdi	r10,r10,8		/* Shift it so we can compare	*/
+	cmpldi	r10,(0x200000 >> 8)	/* .. with an immediate.	*/
+	bne 1f				/* If != go to normal path.	*/
+
+	/* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack	*/
+	andi.	r10,r12,MSR_PR;		/* Set CR0 correctly for label	*/
+					/* 3 in EXCEPTION_PROLOG_COMMON	*/
+	mr	r10,r1			/* Save r1			*/
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	b 3f				/* Jump into the macro !!	*/
+1:	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	program_check_exception
+	b	ret_from_except
+
+
+EXC_REAL(fp_unavailable, 0x800, 0x100)
+EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800)
+TRAMP_KVM(PACA_EXGEN, 0x800)
+EXC_COMMON_BEGIN(fp_unavailable_common)
+	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
+	bne	1f			/* if from user, just load it up */
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	kernel_fp_unavailable_exception
+	BUG_OPCODE
+1:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+	bl	load_up_fpu
+	b	fast_exception_return
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	fp_unavailable_tm
+	b	ret_from_except
+#endif
+
+
+EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED)
+EXC_VIRT_OOL_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED)
+TRAMP_KVM(PACA_EXGEN, 0x900)
+EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
+
+
+EXC_REAL_OOL_HV(hdecrementer, 0x980, 0x80)
+EXC_VIRT_OOL_HV(hdecrementer, 0x4980, 0x80, 0x980)
+TRAMP_KVM_HV(PACA_EXGEN, 0x980)
+EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
+
+
+EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED)
+EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED)
+TRAMP_KVM(PACA_EXGEN, 0xa00)
+#ifdef CONFIG_PPC_DOORBELL
+EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
+#else
+EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception)
+#endif
+
+
+EXC_REAL(trap_0b, 0xb00, 0x100)
+EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00)
+TRAMP_KVM(PACA_EXGEN, 0xb00)
+EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
+
+/*
+ * system call / hypercall (0xc00, 0x4c00)
+ *
+ * The system call exception is invoked with "sc 0" and does not alter HV bit.
+ * There is support for kernel code to invoke system calls but there are no
+ * in-tree users.
+ *
+ * The hypercall is invoked with "sc 1" and sets HV=1.
+ *
+ * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to
+ * 0x4c00 virtual mode.
+ *
+ * Call convention:
+ *
+ * syscall register convention is in Documentation/powerpc/syscall64-abi.txt
+ *
+ * For hypercalls, the register convention is as follows:
+ * r0 volatile
+ * r1-2 nonvolatile
+ * r3 volatile parameter and return value for status
+ * r4-r10 volatile input and output value
+ * r11 volatile hypercall number and output value
+ * r12 volatile input and output value
+ * r13-r31 nonvolatile
+ * LR nonvolatile
+ * CTR volatile
+ * XER volatile
+ * CR0-1 CR5-7 volatile
+ * CR2-4 nonvolatile
+ * Other registers nonvolatile
+ *
+ * The intersection of volatile registers that don't contain possible
+ * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry
+ * without saving, though xer is not a good idea to use, as hardware may
+ * interpret some bits so it may be costly to change them.
+ */
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	/*
+	 * There is a little bit of juggling to get syscall and hcall
+	 * working well. Save r13 in ctr to avoid using SPRG scratch
+	 * register.
+	 *
+	 * Userspace syscalls have already saved the PPR, hcalls must save
+	 * it before setting HMT_MEDIUM.
+	 */
+#define SYSCALL_KVMTEST							\
+	mtctr	r13;							\
+	GET_PACA(r13);							\
+	std	r10,PACA_EXGEN+EX_R10(r13);				\
+	INTERRUPT_TO_KERNEL;						\
+	KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \
+	HMT_MEDIUM;							\
+	mfctr	r9;
+
+#else
+#define SYSCALL_KVMTEST							\
+	HMT_MEDIUM;							\
+	mr	r9,r13;							\
+	GET_PACA(r13);							\
+	INTERRUPT_TO_KERNEL;
+#endif
+	
+#define LOAD_SYSCALL_HANDLER(reg)					\
+	__LOAD_HANDLER(reg, system_call_common)
+
+/*
+ * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9,
+ * and HMT_MEDIUM.
+ */
+#define SYSCALL_REAL	 					\
+	mfspr	r11,SPRN_SRR0 ;					\
+	mfspr	r12,SPRN_SRR1 ;					\
+	LOAD_SYSCALL_HANDLER(r10) ; 				\
+	mtspr	SPRN_SRR0,r10 ; 				\
+	ld	r10,PACAKMSR(r13) ;				\
+	mtspr	SPRN_SRR1,r10 ; 				\
+	RFI_TO_KERNEL ;						\
+	b	. ;	/* prevent speculative execution */
+
+#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
+#define SYSCALL_FASTENDIAN_TEST					\
+BEGIN_FTR_SECTION						\
+	cmpdi	r0,0x1ebe ; 					\
+	beq-	1f ;						\
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
+
+#define SYSCALL_FASTENDIAN					\
+	/* Fast LE/BE switch system call */			\
+1:	mfspr	r12,SPRN_SRR1 ;					\
+	xori	r12,r12,MSR_LE ;				\
+	mtspr	SPRN_SRR1,r12 ;					\
+	mr	r13,r9 ;					\
+	RFI_TO_USER ;	/* return to userspace */		\
+	b	. ;	/* prevent speculative execution */
+#else
+#define SYSCALL_FASTENDIAN_TEST
+#define SYSCALL_FASTENDIAN
+#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */
+
+#if defined(CONFIG_RELOCATABLE)
+	/*
+	 * We can't branch directly so we do it via the CTR which
+	 * is volatile across system calls.
+	 */
+#define SYSCALL_VIRT						\
+	LOAD_SYSCALL_HANDLER(r10) ;				\
+	mtctr	r10 ;						\
+	mfspr	r11,SPRN_SRR0 ;					\
+	mfspr	r12,SPRN_SRR1 ;					\
+	li	r10,MSR_RI ;					\
+	mtmsrd 	r10,1 ;						\
+	bctr ;
+#else
+	/* We can branch directly */
+#define SYSCALL_VIRT						\
+	mfspr	r11,SPRN_SRR0 ;					\
+	mfspr	r12,SPRN_SRR1 ;					\
+	li	r10,MSR_RI ;					\
+	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
+	b	system_call_common ;
+#endif
+
+EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
+	SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+	SYSCALL_FASTENDIAN_TEST
+	SYSCALL_REAL
+	SYSCALL_FASTENDIAN
+EXC_REAL_END(system_call, 0xc00, 0x100)
+
+EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
+	SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+	SYSCALL_FASTENDIAN_TEST
+	SYSCALL_VIRT
+	SYSCALL_FASTENDIAN
+EXC_VIRT_END(system_call, 0x4c00, 0x100)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	/*
+	 * This is a hcall, so register convention is as above, with these
+	 * differences:
+	 * r13 = PACA
+	 * ctr = orig r13
+	 * orig r10 saved in PACA
+	 */
+TRAMP_KVM_BEGIN(do_kvm_0xc00)
+	 /*
+	  * Save the PPR (on systems that support it) before changing to
+	  * HMT_MEDIUM. That allows the KVM code to save that value into the
+	  * guest state (it is the guest's PPR value).
+	  */
+	OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR)
+	HMT_MEDIUM
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR)
+	mfctr	r10
+	SET_SCRATCH0(r10)
+	std	r9,PACA_EXGEN+EX_R9(r13)
+	mfcr	r9
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+#endif
+
+
+EXC_REAL(single_step, 0xd00, 0x100)
+EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00)
+TRAMP_KVM(PACA_EXGEN, 0xd00)
+EXC_COMMON(single_step_common, 0xd00, single_step_exception)
+
+EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20)
+EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
+EXC_COMMON_BEGIN(h_data_storage_common)
+	mfspr   r10,SPRN_HDAR
+	std     r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr   r10,SPRN_HDSISR
+	stw     r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
+	bl      save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      unknown_exception
+	b       ret_from_except
+
+
+EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20)
+EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
+EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
+
+
+EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20)
+EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
+EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
+
+
+/*
+ * hmi_exception trampoline is a special case. It jumps to hmi_exception_early
+ * first, and then eventaully from there to the trampoline to get into virtual
+ * mode.
+ */
+__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0x20, hmi_exception_early)
+__TRAMP_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60, IRQS_DISABLED)
+EXC_VIRT_NONE(0x4e60, 0x20)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
+TRAMP_REAL_BEGIN(hmi_exception_early)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
+	mr	r10,r1			/* Save r1 */
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack for realmode */
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
+	mfspr	r12,SPRN_HSRR1		/* Save HSRR1 */
+	EXCEPTION_PROLOG_COMMON_1()
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
+	EXCEPTION_PROLOG_COMMON_3(0xe60)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	BRANCH_LINK_TO_FAR(DOTSYM(hmi_exception_realmode)) /* Function call ABI */
+	cmpdi	cr0,r3,0
+
+	/* Windup the stack. */
+	/* Move original HSRR0 and HSRR1 into the respective regs */
+	ld	r9,_MSR(r1)
+	mtspr	SPRN_HSRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_HSRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	REST_2GPRS(12, r1)
+	bne	1f
+	mtcr	r11
+	REST_GPR(11, r1)
+	ld	r1,GPR1(r1)
+	HRFI_TO_USER_OR_KERNEL
+
+1:	mtcr	r11
+	REST_GPR(11, r1)
+	ld	r1,GPR1(r1)
+
+	/*
+	 * Go to virtual mode and pull the HMI event information from
+	 * firmware.
+	 */
+	.globl hmi_exception_after_realmode
+hmi_exception_after_realmode:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	tramp_real_hmi_exception
+
+EXC_COMMON_BEGIN(hmi_exception_common)
+EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception,
+        ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON)
+
+EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED)
+EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
+#ifdef CONFIG_PPC_DOORBELL
+EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
+#else
+EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
+#endif
+
+
+EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED)
+EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED)
+TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
+EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
+
+
+EXC_REAL_NONE(0xec0, 0x20)
+EXC_VIRT_NONE(0x4ec0, 0x20)
+EXC_REAL_NONE(0xee0, 0x20)
+EXC_VIRT_NONE(0x4ee0, 0x20)
+
+
+EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED)
+EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED)
+TRAMP_KVM(PACA_EXGEN, 0xf00)
+EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
+
+
+EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20)
+EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20)
+TRAMP_KVM(PACA_EXGEN, 0xf20)
+EXC_COMMON_BEGIN(altivec_unavailable_common)
+	EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN)
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	beq	1f
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+  BEGIN_FTR_SECTION_NESTED(69)
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+  END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+	bl	load_up_altivec
+	b	fast_exception_return
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	altivec_unavailable_tm
+	b	ret_from_except
+#endif
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	altivec_unavailable_exception
+	b	ret_from_except
+
+
+EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20)
+EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40)
+TRAMP_KVM(PACA_EXGEN, 0xf40)
+EXC_COMMON_BEGIN(vsx_unavailable_common)
+	EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	beq	1f
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+  BEGIN_FTR_SECTION_NESTED(69)
+	/* Test if 2 TM state bits are zero.  If non-zero (ie. userspace was in
+	 * transaction), go do TM stuff
+	 */
+	rldicl.	r0, r12, (64-MSR_TS_LG), (64-2)
+	bne-	2f
+  END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+	b	load_up_vsx
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2:	/* User process was in a transaction */
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	vsx_unavailable_tm
+	b	ret_from_except
+#endif
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	bl	save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	vsx_unavailable_exception
+	b	ret_from_except
+
+
+EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20)
+EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60)
+TRAMP_KVM(PACA_EXGEN, 0xf60)
+EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
+
+
+EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20)
+EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80)
+TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
+EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
+
+
+EXC_REAL_NONE(0xfa0, 0x20)
+EXC_VIRT_NONE(0x4fa0, 0x20)
+EXC_REAL_NONE(0xfc0, 0x20)
+EXC_VIRT_NONE(0x4fc0, 0x20)
+EXC_REAL_NONE(0xfe0, 0x20)
+EXC_VIRT_NONE(0x4fe0, 0x20)
+
+EXC_REAL_NONE(0x1000, 0x100)
+EXC_VIRT_NONE(0x5000, 0x100)
+EXC_REAL_NONE(0x1100, 0x100)
+EXC_VIRT_NONE(0x5100, 0x100)
+
+#ifdef CONFIG_CBE_RAS
+EXC_REAL_HV(cbe_system_error, 0x1200, 0x100)
+EXC_VIRT_NONE(0x5200, 0x100)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
+EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1200, 0x100)
+EXC_VIRT_NONE(0x5200, 0x100)
+#endif
+
+
+EXC_REAL(instruction_breakpoint, 0x1300, 0x100)
+EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300)
+TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
+EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
+
+EXC_REAL_NONE(0x1400, 0x100)
+EXC_VIRT_NONE(0x5400, 0x100)
+
+EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
+	mtspr	SPRN_SPRG_HSCRATCH0,r13
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+	mfspr	r10,SPRN_HSRR1
+	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
+	bne+	denorm_assist
+#endif
+
+	KVMTEST_HV(0x1500)
+	EXCEPTION_PROLOG_2(denorm_common, EXC_HV)
+EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100)
+	b	exc_real_0x1500_denorm_exception_hv
+EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
+#else
+EXC_VIRT_NONE(0x5500, 0x100)
+#endif
+
+TRAMP_KVM_HV(PACA_EXGEN, 0x1500)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+TRAMP_REAL_BEGIN(denorm_assist)
+BEGIN_FTR_SECTION
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER6 do that here for all FP regs.
+ */
+	mfmsr	r10
+	ori	r10,r10,(MSR_FP|MSR_FE0|MSR_FE1)
+	xori	r10,r10,(MSR_FE0|MSR_FE1)
+	mtmsrd	r10
+	sync
+
+#define FMR2(n)  fmr (n), (n) ; fmr n+1, n+1
+#define FMR4(n)  FMR2(n) ; FMR2(n+2)
+#define FMR8(n)  FMR4(n) ; FMR4(n+4)
+#define FMR16(n) FMR8(n) ; FMR8(n+8)
+#define FMR32(n) FMR16(n) ; FMR16(n+16)
+	FMR32(0)
+
+FTR_SECTION_ELSE
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER7 do that here for the first 32 VSX registers only.
+ */
+	mfmsr	r10
+	oris	r10,r10,MSR_VSX@h
+	mtmsrd	r10
+	sync
+
+#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
+#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
+#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
+#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
+#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
+	XVCPSGNDP32(0)
+
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
+
+BEGIN_FTR_SECTION
+	b	denorm_done
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER8 we need to do that for all 64 VSX registers
+ */
+	XVCPSGNDP32(32)
+denorm_done:
+	mfspr	r11,SPRN_HSRR0
+	subi	r11,r11,4
+	mtspr	SPRN_HSRR0,r11
+	mtcrf	0x80,r9
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	RESTORE_PPR_PACA(PACA_EXGEN, r10)
+BEGIN_FTR_SECTION
+	ld	r10,PACA_EXGEN+EX_CFAR(r13)
+	mtspr	SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	HRFI_TO_UNKNOWN
+	b	.
+#endif
+
+EXC_COMMON(denorm_common, 0x1500, unknown_exception)
+
+
+#ifdef CONFIG_CBE_RAS
+EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100)
+EXC_VIRT_NONE(0x5600, 0x100)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
+EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1600, 0x100)
+EXC_VIRT_NONE(0x5600, 0x100)
+#endif
+
+
+EXC_REAL(altivec_assist, 0x1700, 0x100)
+EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700)
+TRAMP_KVM(PACA_EXGEN, 0x1700)
+#ifdef CONFIG_ALTIVEC
+EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
+#else
+EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
+#endif
+
+
+#ifdef CONFIG_CBE_RAS
+EXC_REAL_HV(cbe_thermal, 0x1800, 0x100)
+EXC_VIRT_NONE(0x5800, 0x100)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)
+EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1800, 0x100)
+EXC_VIRT_NONE(0x5800, 0x100)
+#endif
+
+#ifdef CONFIG_PPC_WATCHDOG
+
+#define MASKED_DEC_HANDLER_LABEL 3f
+
+#define MASKED_DEC_HANDLER(_H)				\
+3: /* soft-nmi */					\
+	std	r12,PACA_EXGEN+EX_R12(r13);		\
+	GET_SCRATCH0(r10);				\
+	std	r10,PACA_EXGEN+EX_R13(r13);		\
+	EXCEPTION_PROLOG_2(soft_nmi_common, _H)
+
+/*
+ * Branch to soft_nmi_interrupt using the emergency stack. The emergency
+ * stack is one that is usable by maskable interrupts so long as MSR_EE
+ * remains off. It is used for recovery when something has corrupted the
+ * normal kernel stack, for example. The "soft NMI" must not use the process
+ * stack because we want irq disabled sections to avoid touching the stack
+ * at all (other than PMU interrupts), so use the emergency stack for this,
+ * and run it entirely with interrupts hard disabled.
+ */
+EXC_COMMON_BEGIN(soft_nmi_common)
+	mr	r10,r1
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,INT_FRAME_SIZE
+	EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900,
+			system_reset, soft_nmi_interrupt,
+			ADD_NVGPRS;ADD_RECONCILE)
+	b	ret_from_except
+
+#else /* CONFIG_PPC_WATCHDOG */
+#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */
+#define MASKED_DEC_HANDLER(_H)
+#endif /* CONFIG_PPC_WATCHDOG */
+
+/*
+ * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
+ * - If it was a decrementer interrupt, we bump the dec to max and and return.
+ * - If it was a doorbell we return immediately since doorbells are edge
+ *   triggered and won't automatically refire.
+ * - If it was a HMI we return immediately since we handled it in realmode
+ *   and it won't refire.
+ * - Else it is one of PACA_IRQ_MUST_HARD_MASK, so hard disable and return.
+ * This is called with r10 containing the value to OR to the paca field.
+ */
+#define MASKED_INTERRUPT(_H)				\
+masked_##_H##interrupt:					\
+	std	r11,PACA_EXGEN+EX_R11(r13);		\
+	lbz	r11,PACAIRQHAPPENED(r13);		\
+	or	r11,r11,r10;				\
+	stb	r11,PACAIRQHAPPENED(r13);		\
+	cmpwi	r10,PACA_IRQ_DEC;			\
+	bne	1f;					\
+	lis	r10,0x7fff;				\
+	ori	r10,r10,0xffff;				\
+	mtspr	SPRN_DEC,r10;				\
+	b	MASKED_DEC_HANDLER_LABEL;		\
+1:	andi.	r10,r10,PACA_IRQ_MUST_HARD_MASK;	\
+	beq	2f;					\
+	mfspr	r10,SPRN_##_H##SRR1;			\
+	xori	r10,r10,MSR_EE; /* clear MSR_EE */	\
+	mtspr	SPRN_##_H##SRR1,r10;			\
+	ori	r11,r11,PACA_IRQ_HARD_DIS;		\
+	stb	r11,PACAIRQHAPPENED(r13);		\
+2:	/* done */					\
+	mtcrf	0x80,r9;				\
+	std	r1,PACAR1(r13);				\
+	ld	r9,PACA_EXGEN+EX_R9(r13);		\
+	ld	r10,PACA_EXGEN+EX_R10(r13);		\
+	ld	r11,PACA_EXGEN+EX_R11(r13);		\
+	/* returns to kernel where r13 must be set up, so don't restore it */ \
+	##_H##RFI_TO_KERNEL;				\
+	b	.;					\
+	MASKED_DEC_HANDLER(_H)
+
+TRAMP_REAL_BEGIN(stf_barrier_fallback)
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	sync
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ori	31,31,0
+	.rept 14
+	b	1f
+1:
+	.endr
+	blr
+
+/* Clobbers r10, r11, ctr */
+.macro L1D_DISPLACEMENT_FLUSH
+	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+	ld	r11,PACA_L1D_FLUSH_SIZE(r13)
+	srdi	r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
+	mtctr	r11
+	DCBT_BOOK3S_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+	/* order ld/st prior to dcbt stop all streams with flushing */
+	sync
+
+	/*
+	 * The load addresses are at staggered offsets within cachelines,
+	 * which suits some pipelines better (on others it should not
+	 * hurt).
+	 */
+1:
+	ld	r11,(0x80 + 8)*0(r10)
+	ld	r11,(0x80 + 8)*1(r10)
+	ld	r11,(0x80 + 8)*2(r10)
+	ld	r11,(0x80 + 8)*3(r10)
+	ld	r11,(0x80 + 8)*4(r10)
+	ld	r11,(0x80 + 8)*5(r10)
+	ld	r11,(0x80 + 8)*6(r10)
+	ld	r11,(0x80 + 8)*7(r10)
+	addi	r10,r10,0x80*8
+	bdnz	1b
+.endm
+
+TRAMP_REAL_BEGIN(entry_flush_fallback)
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	mfctr	r9
+	L1D_DISPLACEMENT_FLUSH
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	blr
+
+TRAMP_REAL_BEGIN(rfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r1,PACA_EXRFI+EX_R12(r13)
+	ld	r1,PACAKSAVE(r13)
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	mfctr	r9
+	L1D_DISPLACEMENT_FLUSH
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r1,PACA_EXRFI+EX_R12(r13)
+	GET_SCRATCH0(r13);
+	rfid
+
+TRAMP_REAL_BEGIN(hrfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r1,PACA_EXRFI+EX_R12(r13)
+	ld	r1,PACAKSAVE(r13)
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	mfctr	r9
+	L1D_DISPLACEMENT_FLUSH
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r1,PACA_EXRFI+EX_R12(r13)
+	GET_SCRATCH0(r13);
+	hrfid
+
+USE_TEXT_SECTION()
+
+_GLOBAL(do_uaccess_flush)
+	UACCESS_FLUSH_FIXUP_SECTION
+	nop
+	nop
+	nop
+	blr
+	L1D_DISPLACEMENT_FLUSH
+	blr
+_ASM_NOKPROBE_SYMBOL(do_uaccess_flush)
+EXPORT_SYMBOL(do_uaccess_flush)
+
+/*
+ * Real mode exceptions actually use this too, but alternate
+ * instruction code patches (which end up in the common .text area)
+ * cannot reach these if they are put there.
+ */
+USE_FIXED_SECTION(virt_trampolines)
+	MASKED_INTERRUPT()
+	MASKED_INTERRUPT(H)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_SRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_SRR0, r13
+	GET_SCRATCH0(r13)
+	RFI_TO_KERNEL
+	b	.
+
+TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_HSRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_HSRR0, r13
+	GET_SCRATCH0(r13)
+	HRFI_TO_KERNEL
+	b	.
+#endif
+
+/*
+ * Ensure that any handlers that get invoked from the exception prologs
+ * above are below the first 64KB (0x10000) of the kernel image because
+ * the prologs assemble the addresses of these handlers using the
+ * LOAD_HANDLER macro, which uses an ori instruction.
+ */
+
+/*** Common interrupt handlers ***/
+
+
+	/*
+	 * Relocation-on interrupts: A subset of the interrupts can be delivered
+	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
+	 * it.  Addresses are the same as the original interrupt addresses, but
+	 * offset by 0xc000000000004000.
+	 * It's impossible to receive interrupts below 0x300 via this mechanism.
+	 * KVM: None of these traps are from the guest ; anything that escalated
+	 * to HV=1 from HV=0 is delivered via real mode handlers.
+	 */
+
+	/*
+	 * This uses the standard macro, since the original 0x300 vector
+	 * only has extra guff for STAB-based processors -- which never
+	 * come here.
+	 */
+
+EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline)
+	b	__ppc64_runlatch_on
+
+USE_FIXED_SECTION(virt_trampolines)
+	/*
+	 * The __end_interrupts marker must be past the out-of-line (OOL)
+	 * handlers, so that they are copied to real address 0x100 when running
+	 * a relocatable kernel. This ensures they can be reached from the short
+	 * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch
+	 * directly, without using LOAD_HANDLER().
+	 */
+	.align	7
+	.globl	__end_interrupts
+__end_interrupts:
+DEFINE_FIXED_SYMBOL(__end_interrupts)
+
+#ifdef CONFIG_PPC_970_NAP
+EXC_COMMON_BEGIN(power4_fixup_nap)
+	andc	r9,r9,r10
+	std	r9,TI_LOCAL_FLAGS(r11)
+	ld	r10,_LINK(r1)		/* make idle task do the */
+	std	r10,_NIP(r1)		/* equivalent of a blr */
+	blr
+#endif
+
+CLOSE_FIXED_SECTION(real_vectors);
+CLOSE_FIXED_SECTION(real_trampolines);
+CLOSE_FIXED_SECTION(virt_vectors);
+CLOSE_FIXED_SECTION(virt_trampolines);
+
+USE_TEXT_SECTION()
+
+/*
+ * Hash table stuff
+ */
+	.balign	IFETCH_ALIGN_BYTES
+do_hash_page:
+#ifdef CONFIG_PPC_BOOK3S_64
+	lis	r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
+	ori	r0,r0,DSISR_BAD_FAULT_64S@l
+	and.	r0,r4,r0		/* weird error? */
+	bne-	handle_page_fault	/* if not, try to insert a HPTE */
+	CURRENT_THREAD_INFO(r11, r1)
+	lwz	r0,TI_PREEMPT(r11)	/* If we're in an "NMI" */
+	andis.	r0,r0,NMI_MASK@h	/* (i.e. an irq when soft-disabled) */
+	bne	77f			/* then don't call hash_page now */
+
+	/*
+	 * r3 contains the faulting address
+	 * r4 msr
+	 * r5 contains the trap number
+	 * r6 contains dsisr
+	 *
+	 * at return r3 = 0 for success, 1 for page fault, negative for error
+	 */
+        mr 	r4,r12
+	ld      r6,_DSISR(r1)
+	bl	__hash_page		/* build HPTE if possible */
+        cmpdi	r3,0			/* see if __hash_page succeeded */
+
+	/* Success */
+	beq	fast_exc_return_irq	/* Return from exception on success */
+
+	/* Error */
+	blt-	13f
+
+	/* Reload DSISR into r4 for the DABR check below */
+	ld      r4,_DSISR(r1)
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+/* Here we have a page fault that hash_page can't handle. */
+handle_page_fault:
+11:	andis.  r0,r4,DSISR_DABRMATCH@h
+	bne-    handle_dabr_fault
+	ld	r4,_DAR(r1)
+	ld	r5,_DSISR(r1)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	do_page_fault
+	cmpdi	r3,0
+	beq+	ret_from_except_lite
+	bl	save_nvgprs
+	mr	r5,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	lwz	r4,_DAR(r1)
+	bl	bad_page_fault
+	b	ret_from_except
+
+/* We have a data breakpoint exception - handle it */
+handle_dabr_fault:
+	bl	save_nvgprs
+	ld      r4,_DAR(r1)
+	ld      r5,_DSISR(r1)
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      do_break
+	/*
+	 * do_break() may have changed the NV GPRS while handling a breakpoint.
+	 * If so, we need to restore them with their updated values. Don't use
+	 * ret_from_except_lite here.
+	 */
+	b       ret_from_except
+
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* We have a page fault that hash_page could handle but HV refused
+ * the PTE insertion
+ */
+13:	bl	save_nvgprs
+	mr	r5,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	ld	r4,_DAR(r1)
+	bl	low_hash_fault
+	b	ret_from_except
+#endif
+
+/*
+ * We come here as a result of a DSI at a point where we don't want
+ * to call hash_page, such as when we are accessing memory (possibly
+ * user memory) inside a PMU interrupt that occurred while interrupts
+ * were soft-disabled.  We want to invoke the exception handler for
+ * the access, or panic if there isn't a handler.
+ */
+77:	bl	save_nvgprs
+	mr	r4,r3
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	li	r5,SIGSEGV
+	bl	bad_page_fault
+	b	ret_from_except
+
+/*
+ * Here we have detected that the kernel stack pointer is bad.
+ * R9 contains the saved CR, r13 points to the paca,
+ * r10 contains the (bad) kernel stack pointer,
+ * r11 and r12 contain the saved SRR0 and SRR1.
+ * We switch to using an emergency stack, save the registers there,
+ * and call kernel_bad_stack(), which panics.
+ */
+bad_stack:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,64+INT_FRAME_SIZE
+	std	r9,_CCR(r1)
+	std	r10,GPR1(r1)
+	std	r11,_NIP(r1)
+	std	r12,_MSR(r1)
+	mfspr	r11,SPRN_DAR
+	mfspr	r12,SPRN_DSISR
+	std	r11,_DAR(r1)
+	std	r12,_DSISR(r1)
+	mflr	r10
+	mfctr	r11
+	mfxer	r12
+	std	r10,_LINK(r1)
+	std	r11,_CTR(r1)
+	std	r12,_XER(r1)
+	SAVE_GPR(0,r1)
+	SAVE_GPR(2,r1)
+	ld	r10,EX_R3(r3)
+	std	r10,GPR3(r1)
+	SAVE_GPR(4,r1)
+	SAVE_4GPRS(5,r1)
+	ld	r9,EX_R9(r3)
+	ld	r10,EX_R10(r3)
+	SAVE_2GPRS(9,r1)
+	ld	r9,EX_R11(r3)
+	ld	r10,EX_R12(r3)
+	ld	r11,EX_R13(r3)
+	std	r9,GPR11(r1)
+	std	r10,GPR12(r1)
+	std	r11,GPR13(r1)
+BEGIN_FTR_SECTION
+	ld	r10,EX_CFAR(r3)
+	std	r10,ORIG_GPR3(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	SAVE_8GPRS(14,r1)
+	SAVE_10GPRS(22,r1)
+	lhz	r12,PACA_TRAP_SAVE(r13)
+	std	r12,_TRAP(r1)
+	addi	r11,r1,INT_FRAME_SIZE
+	std	r11,0(r1)
+	li	r12,0
+	std	r12,0(r11)
+	ld	r2,PACATOC(r13)
+	ld	r11,exception_marker@toc(r2)
+	std	r12,RESULT(r1)
+	std	r11,STACK_FRAME_OVERHEAD-16(r1)
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	kernel_bad_stack
+	b	1b
+_ASM_NOKPROBE_SYMBOL(bad_stack);
+
+/*
+ * When doorbell is triggered from system reset wakeup, the message is
+ * not cleared, so it would fire again when EE is enabled.
+ *
+ * When coming from local_irq_enable, there may be the same problem if
+ * we were hard disabled.
+ *
+ * Execute msgclr to clear pending exceptions before handling it.
+ */
+h_doorbell_common_msgclr:
+	LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+	PPC_MSGCLR(3)
+	b 	h_doorbell_common
+
+doorbell_super_common_msgclr:
+	LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+	PPC_MSGCLRP(3)
+	b 	doorbell_super_common
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate
+ * which kind of interrupt. MSR:EE is already off. We generate a
+ * stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ *
+ * Note that we don't specify LR as the NIP (return address) for
+ * the interrupt because that would unbalance the return branch
+ * predictor.
+ */
+_GLOBAL(__replay_interrupt)
+	/* We are going to jump to the exception common code which
+	 * will retrieve various register values from the PACA which
+	 * we don't give a damn about, so we don't bother storing them.
+	 */
+	mfmsr	r12
+	LOAD_REG_ADDR(r11, replay_interrupt_return)
+	mfcr	r9
+	ori	r12,r12,MSR_EE
+	cmpwi	r3,0x900
+	beq	decrementer_common
+	cmpwi	r3,0x500
+BEGIN_FTR_SECTION
+	beq	h_virt_irq_common
+FTR_SECTION_ELSE
+	beq	hardware_interrupt_common
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300)
+	cmpwi	r3,0xf00
+	beq	performance_monitor_common
+BEGIN_FTR_SECTION
+	cmpwi	r3,0xa00
+	beq	h_doorbell_common_msgclr
+	cmpwi	r3,0xe60
+	beq	hmi_exception_common
+FTR_SECTION_ELSE
+	cmpwi	r3,0xa00
+	beq	doorbell_super_common_msgclr
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+replay_interrupt_return:
+	blr
+
+_ASM_NOKPROBE_SYMBOL(__replay_interrupt)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 000000000..c02c95287
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,1617 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. The most of the code implementation has been adapted
+ * from phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+
+#include <asm/debugfs.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/setup.h>
+
+static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges *crash_memory_ranges;
+int crash_memory_ranges_size;
+int crash_mem_ranges;
+int max_crash_mem_ranges;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	const __be32 *sections;
+	int i, num_sections;
+	int size;
+	const __be32 *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 1;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token);
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active)
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 * For each dump section type supported, a 32bit cell which defines
+	 * the ID of a supported section followed by two 32 bit cells which
+	 * gives teh size of the section in bytes.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 1;
+
+	num_sections = size / (3 * sizeof(u32));
+
+	for (i = 0; i < num_sections; i++, sections += 3) {
+		u32 type = (u32)of_read_number(sections, 1);
+
+		switch (type) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * If fadump is registered, check if the memory provided
+ * falls within boot memory area and reserved memory area.
+ */
+int is_fadump_memory_area(u64 addr, ulong size)
+{
+	u64 d_start = fw_dump.reserve_dump_area_start;
+	u64 d_end = d_start + fw_dump.reserve_dump_area_size;
+
+	if (!fw_dump.dump_registered)
+		return 0;
+
+	if (((addr + size) > d_start) && (addr <= d_end))
+		return 1;
+
+	return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
+}
+
+int should_fadump_crash(void)
+{
+	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+		return 0;
+	return 1;
+}
+
+int is_fadump_active(void)
+{
+	return fw_dump.dump_active;
+}
+
+/*
+ * Returns 1, if there are no holes in boot memory area,
+ * 0 otherwise.
+ */
+static int is_boot_memory_area_contiguous(void)
+{
+	struct memblock_region *reg;
+	unsigned long tstart, tend;
+	unsigned long start_pfn = PHYS_PFN(RMA_START);
+	unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size);
+	unsigned int ret = 0;
+
+	for_each_memblock(memory, reg) {
+		tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+		tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+		if (tstart < tend) {
+			/* Memory hole from start_pfn to tstart */
+			if (tstart > start_pfn)
+				break;
+
+			if (tend == end_pfn) {
+				ret = 1;
+				break;
+			}
+
+			start_pfn = tend + 1;
+		}
+	}
+
+	return ret;
+}
+
+/* Print firmware assisted dump configurations for debugging purpose. */
+static void fadump_show_config(void)
+{
+	pr_debug("Support for firmware-assisted dump (fadump): %s\n",
+			(fw_dump.fadump_supported ? "present" : "no support"));
+
+	if (!fw_dump.fadump_supported)
+		return;
+
+	pr_debug("Fadump enabled    : %s\n",
+				(fw_dump.fadump_enabled ? "yes" : "no"));
+	pr_debug("Dump Active       : %s\n",
+				(fw_dump.dump_active ? "yes" : "no"));
+	pr_debug("Dump section sizes:\n");
+	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
+	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
+	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
+}
+
+static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
+				unsigned long addr)
+{
+	if (!fdm)
+		return 0;
+
+	memset(fdm, 0, sizeof(struct fadump_mem_struct));
+	addr = addr & PAGE_MASK;
+
+	fdm->header.dump_format_version = cpu_to_be32(0x00000001);
+	fdm->header.dump_num_sections = cpu_to_be16(3);
+	fdm->header.dump_status_flag = 0;
+	fdm->header.offset_first_dump_section =
+		cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data));
+
+	/*
+	 * Fields for disk dump option.
+	 * We are not using disk dump option, hence set these fields to 0.
+	 */
+	fdm->header.dd_block_size = 0;
+	fdm->header.dd_block_offset = 0;
+	fdm->header.dd_num_blocks = 0;
+	fdm->header.dd_offset_disk_path = 0;
+
+	/* set 0 to disable an automatic dump-reboot. */
+	fdm->header.max_time_auto = 0;
+
+	/* Kernel dump sections */
+	/* cpu state data section. */
+	fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
+	fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA);
+	fdm->cpu_state_data.source_address = 0;
+	fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size);
+	fdm->cpu_state_data.destination_address = cpu_to_be64(addr);
+	addr += fw_dump.cpu_state_data_size;
+
+	/* hpte region section */
+	fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
+	fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION);
+	fdm->hpte_region.source_address = 0;
+	fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size);
+	fdm->hpte_region.destination_address = cpu_to_be64(addr);
+	addr += fw_dump.hpte_region_size;
+
+	/* RMA region section */
+	fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
+	fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION);
+	fdm->rmr_region.source_address = cpu_to_be64(RMA_START);
+	fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size);
+	fdm->rmr_region.destination_address = cpu_to_be64(addr);
+	addr += fw_dump.boot_memory_size;
+
+	return addr;
+}
+
+/**
+ * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long fadump_calculate_reserve_size(void)
+{
+	int ret;
+	unsigned long long base, size;
+
+	if (fw_dump.reserve_bootvar)
+		pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
+
+	/*
+	 * Check if the size is specified through crashkernel= cmdline
+	 * option. If yes, then use that but ignore base as fadump reserves
+	 * memory at a predefined offset.
+	 */
+	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+				&size, &base);
+	if (ret == 0 && size > 0) {
+		unsigned long max_size;
+
+		if (fw_dump.reserve_bootvar)
+			pr_info("Using 'crashkernel=' parameter for memory reservation.\n");
+
+		fw_dump.reserve_bootvar = (unsigned long)size;
+
+		/*
+		 * Adjust if the boot memory size specified is above
+		 * the upper limit.
+		 */
+		max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO;
+		if (fw_dump.reserve_bootvar > max_size) {
+			fw_dump.reserve_bootvar = max_size;
+			pr_info("Adjusted boot memory size to %luMB\n",
+				(fw_dump.reserve_bootvar >> 20));
+		}
+
+		return fw_dump.reserve_bootvar;
+	} else if (fw_dump.reserve_bootvar) {
+		/*
+		 * 'fadump_reserve_mem=' is being used to reserve memory
+		 * for firmware-assisted dump.
+		 */
+		return fw_dump.reserve_bootvar;
+	}
+
+	/* divide by 20 to get 5% of value */
+	size = memblock_phys_mem_size() / 20;
+
+	/* round it down in multiples of 256 */
+	size = size & ~0x0FFFFFFFUL;
+
+	/* Truncate to memory_limit. We don't want to over reserve the memory.*/
+	if (memory_limit && size > memory_limit)
+		size = memory_limit;
+
+	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_fadump_area_size(void)
+{
+	unsigned long size = 0;
+
+	size += fw_dump.cpu_state_data_size;
+	size += fw_dump.hpte_region_size;
+	size += fw_dump.boot_memory_size;
+	size += sizeof(struct fadump_crash_info_header);
+	size += sizeof(struct elfhdr); /* ELF core header.*/
+	size += sizeof(struct elf_phdr); /* place holder for cpu notes */
+	/* Program headers for crash memory regions. */
+	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
+
+	size = PAGE_ALIGN(size);
+	return size;
+}
+
+static void __init fadump_reserve_crash_area(unsigned long base,
+					     unsigned long size)
+{
+	struct memblock_region *reg;
+	unsigned long mstart, mend, msize;
+
+	for_each_memblock(memory, reg) {
+		mstart = max_t(unsigned long, base, reg->base);
+		mend = reg->base + reg->size;
+		mend = min(base + size, mend);
+
+		if (mstart < mend) {
+			msize = mend - mstart;
+			memblock_reserve(mstart, msize);
+			pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n",
+				(msize >> 20), mstart);
+		}
+	}
+}
+
+int __init fadump_reserve_mem(void)
+{
+	unsigned long base, size, memory_boundary;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_INFO "Firmware-assisted dump is not supported on"
+				" this hardware\n");
+		fw_dump.fadump_enabled = 0;
+		return 0;
+	}
+	/*
+	 * Initialize boot memory size
+	 * If dump is active then we have already calculated the size during
+	 * first kernel.
+	 */
+	if (fdm_active)
+		fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
+	else
+		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+
+	/*
+	 * Calculate the memory boundary.
+	 * If memory_limit is less than actual memory boundary then reserve
+	 * the memory for fadump beyond the memory_limit and adjust the
+	 * memory_limit accordingly, so that the running kernel can run with
+	 * specified memory_limit.
+	 */
+	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+		size = get_fadump_area_size();
+		if ((memory_limit + size) < memblock_end_of_DRAM())
+			memory_limit += size;
+		else
+			memory_limit = memblock_end_of_DRAM();
+		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+				" dump, now %#016llx\n", memory_limit);
+	}
+	if (memory_limit)
+		memory_boundary = memory_limit;
+	else
+		memory_boundary = memblock_end_of_DRAM();
+
+	if (fw_dump.dump_active) {
+		pr_info("Firmware-assisted dump is active.\n");
+
+#ifdef CONFIG_HUGETLB_PAGE
+		/*
+		 * FADump capture kernel doesn't care much about hugepages.
+		 * In fact, handling hugepages in capture kernel is asking for
+		 * trouble. So, disable HugeTLB support when fadump is active.
+		 */
+		hugetlb_disabled = true;
+#endif
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot_memory_size so that we don't touch it until
+		 * dump is written to disk by userspace tool. This memory
+		 * will be released for general use once the dump is saved.
+		 */
+		base = fw_dump.boot_memory_size;
+		size = memory_boundary - base;
+		fadump_reserve_crash_area(base, size);
+
+		fw_dump.fadumphdr_addr =
+				be64_to_cpu(fdm_active->rmr_region.destination_address) +
+				be64_to_cpu(fdm_active->rmr_region.source_len);
+		pr_debug("fadumphdr_addr = %p\n",
+				(void *) fw_dump.fadumphdr_addr);
+	} else {
+		size = get_fadump_area_size();
+
+		/*
+		 * Reserve memory at an offset closer to bottom of the RAM to
+		 * minimize the impact of memory hot-remove operation. We can't
+		 * use memblock_find_in_range() here since it doesn't allocate
+		 * from bottom to top.
+		 */
+		for (base = fw_dump.boot_memory_size;
+		     base <= (memory_boundary - size);
+		     base += size) {
+			if (memblock_is_region_memory(base, size) &&
+			    !memblock_is_region_reserved(base, size))
+				break;
+		}
+		if ((base > (memory_boundary - size)) ||
+		    memblock_reserve(base, size)) {
+			pr_err("Failed to reserve memory\n");
+			return 0;
+		}
+
+		pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
+			"assisted dump (System RAM: %ldMB)\n",
+			(unsigned long)(size >> 20),
+			(unsigned long)(base >> 20),
+			(unsigned long)(memblock_phys_mem_size() >> 20));
+	}
+
+	fw_dump.reserve_dump_area_start = base;
+	fw_dump.reserve_dump_area_size = size;
+	return 1;
+}
+
+unsigned long __init arch_reserved_kernel_pages(void)
+{
+	return memblock_reserved_size() / PAGE_SIZE;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+	if (!p)
+		return 1;
+
+	if (strncmp(p, "on", 2) == 0)
+		fw_dump.fadump_enabled = 1;
+	else if (strncmp(p, "off", 3) == 0)
+		fw_dump.fadump_enabled = 0;
+
+	return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/*
+ * Look for fadump_reserve_mem= cmdline option
+ * TODO: Remove references to 'fadump_reserve_mem=' parameter,
+ *       the sooner 'crashkernel=' parameter is accustomed to.
+ */
+static int __init early_fadump_reserve_mem(char *p)
+{
+	if (p)
+		fw_dump.reserve_bootvar = memparse(p, &p);
+	return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static int register_fw_dump(struct fadump_mem_struct *fdm)
+{
+	int rc, err;
+	unsigned int wait_time;
+
+	pr_debug("Registering for firmware-assisted kernel dump...\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_REGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+
+	} while (wait_time);
+
+	err = -EIO;
+	switch (rc) {
+	default:
+		pr_err("Failed to register. Unknown Error(%d).\n", rc);
+		break;
+	case -1:
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Hardware Error(%d).\n", rc);
+		break;
+	case -3:
+		if (!is_boot_memory_area_contiguous())
+			pr_err("Can't have holes in boot memory area while "
+			       "registering fadump\n");
+
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Parameter Error(%d).\n", rc);
+		err = -EINVAL;
+		break;
+	case -9:
+		printk(KERN_ERR "firmware-assisted kernel dump is already "
+			" registered.");
+		fw_dump.dump_registered = 1;
+		err = -EEXIST;
+		break;
+	case 0:
+		printk(KERN_INFO "firmware-assisted kernel dump registration"
+			" is successful\n");
+		fw_dump.dump_registered = 1;
+		err = 0;
+		break;
+	}
+	return err;
+}
+
+void crash_fadump(struct pt_regs *regs, const char *str)
+{
+	struct fadump_crash_info_header *fdh = NULL;
+	int old_cpu, this_cpu;
+
+	if (!should_fadump_crash())
+		return;
+
+	/*
+	 * old_cpu == -1 means this is the first CPU which has come here,
+	 * go ahead and trigger fadump.
+	 *
+	 * old_cpu != -1 means some other CPU has already on it's way
+	 * to trigger fadump, just keep looping here.
+	 */
+	this_cpu = smp_processor_id();
+	old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu);
+
+	if (old_cpu != -1) {
+		/*
+		 * We can't loop here indefinitely. Wait as long as fadump
+		 * is in force. If we race with fadump un-registration this
+		 * loop will break and then we go down to normal panic path
+		 * and reboot. If fadump is in force the first crashing
+		 * cpu will definitely trigger fadump.
+		 */
+		while (fw_dump.dump_registered)
+			cpu_relax();
+		return;
+	}
+
+	fdh = __va(fw_dump.fadumphdr_addr);
+	fdh->crashing_cpu = crashing_cpu;
+	crash_save_vmcoreinfo();
+
+	if (regs)
+		fdh->regs = *regs;
+	else
+		ppc_save_regs(&fdh->regs);
+
+	fdh->online_mask = *cpu_online_mask;
+
+	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
+	rtas_os_term((char *)str);
+}
+
+#define GPR_MASK	0xffffff0000000000
+static inline int fadump_gpr_index(u64 id)
+{
+	int i = -1;
+	char str[3];
+
+	if ((id & GPR_MASK) == REG_ID("GPR")) {
+		/* get the digits at the end */
+		id &= ~GPR_MASK;
+		id >>= 24;
+		str[2] = '\0';
+		str[1] = id & 0xff;
+		str[0] = (id >> 8) & 0xff;
+		sscanf(str, "%d", &i);
+		if (i > 31)
+			i = -1;
+	}
+	return i;
+}
+
+static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
+								u64 reg_val)
+{
+	int i;
+
+	i = fadump_gpr_index(reg_id);
+	if (i >= 0)
+		regs->gpr[i] = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("NIA"))
+		regs->nip = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("MSR"))
+		regs->msr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CTR"))
+		regs->ctr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("LR"))
+		regs->link = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("XER"))
+		regs->xer = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CR"))
+		regs->ccr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DAR"))
+		regs->dar = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DSISR"))
+		regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct fadump_reg_entry*
+fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) {
+		fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
+					be64_to_cpu(reg_entry->reg_value));
+		reg_entry++;
+	}
+	reg_entry++;
+	return reg_entry;
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+	struct elf_prstatus prstatus;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	/*
+	 * FIXME: How do i get PID? Do I really need it?
+	 * prstatus.pr_pid = ????
+	 */
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+			      &prstatus, sizeof(prstatus));
+	return buf;
+}
+
+static void fadump_update_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* First note is a place holder for cpu notes info. */
+	phdr = (struct elf_phdr *)bufp;
+
+	if (phdr->p_type == PT_NOTE) {
+		phdr->p_paddr = fw_dump.cpu_notes_buf;
+		phdr->p_offset	= phdr->p_paddr;
+		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
+		phdr->p_memsz = fw_dump.cpu_notes_buf_size;
+	}
+	return;
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+	void *vaddr;
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!vaddr)
+		return NULL;
+
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		SetPageReserved(page + i);
+	return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+	struct fadump_reg_save_area_header *reg_header;
+	struct fadump_reg_entry *reg_entry;
+	struct fadump_crash_info_header *fdh = NULL;
+	void *vaddr;
+	unsigned long addr;
+	u32 num_cpus, *note_buf;
+	struct pt_regs regs;
+	int i, rc = 0, cpu = 0;
+
+	if (!fdm->cpu_state_data.bytes_dumped)
+		return -EINVAL;
+
+	addr = be64_to_cpu(fdm->cpu_state_data.destination_address);
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) {
+		printk(KERN_ERR "Unable to read register save area.\n");
+		return -ENOENT;
+	}
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
+	pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
+
+	vaddr += be32_to_cpu(reg_header->num_cpu_offset);
+	num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct fadump_reg_entry *)vaddr;
+
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+	if (!note_buf) {
+		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fw_dump.cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+			(num_cpus * sizeof(note_buf_t)), note_buf);
+
+	if (fw_dump.fadumphdr_addr)
+		fdh = __va(fw_dump.fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) {
+			printk(KERN_ERR "Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK;
+		if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
+			SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = fadump_read_registers(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	final_note(note_buf);
+
+	if (fdh) {
+		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+							fdh->elfcorehdr_addr);
+		fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+	}
+	return 0;
+
+error_out:
+	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+	return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!fdm_active || !fw_dump.fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
+	if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) ||
+			(fdm_active->cpu_state_data.error_flags != 0) ||
+			(fdm_active->rmr_region.error_flags != 0)) {
+		printk(KERN_ERR "Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
+		printk(KERN_ERR "Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fw_dump.fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		printk(KERN_ERR "Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	rc = fadump_build_cpu_notes(fdm_active);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
+}
+
+static void free_crash_memory_ranges(void)
+{
+	kfree(crash_memory_ranges);
+	crash_memory_ranges = NULL;
+	crash_memory_ranges_size = 0;
+	max_crash_mem_ranges = 0;
+}
+
+/*
+ * Allocate or reallocate crash memory ranges array in incremental units
+ * of PAGE_SIZE.
+ */
+static int allocate_crash_memory_ranges(void)
+{
+	struct fad_crash_memory_ranges *new_array;
+	u64 new_size;
+
+	new_size = crash_memory_ranges_size + PAGE_SIZE;
+	pr_debug("Allocating %llu bytes of memory for crash memory ranges\n",
+		 new_size);
+
+	new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL);
+	if (new_array == NULL) {
+		pr_err("Insufficient memory for setting up crash memory ranges\n");
+		free_crash_memory_ranges();
+		return -ENOMEM;
+	}
+
+	crash_memory_ranges = new_array;
+	crash_memory_ranges_size = new_size;
+	max_crash_mem_ranges = (new_size /
+				sizeof(struct fad_crash_memory_ranges));
+	return 0;
+}
+
+static inline int fadump_add_crash_memory(unsigned long long base,
+					  unsigned long long end)
+{
+	u64  start, size;
+	bool is_adjacent = false;
+
+	if (base == end)
+		return 0;
+
+	/*
+	 * Fold adjacent memory ranges to bring down the memory ranges/
+	 * PT_LOAD segments count.
+	 */
+	if (crash_mem_ranges) {
+		start = crash_memory_ranges[crash_mem_ranges - 1].base;
+		size = crash_memory_ranges[crash_mem_ranges - 1].size;
+
+		if ((start + size) == base)
+			is_adjacent = true;
+	}
+	if (!is_adjacent) {
+		/* resize the array on reaching the limit */
+		if (crash_mem_ranges == max_crash_mem_ranges) {
+			int ret;
+
+			ret = allocate_crash_memory_ranges();
+			if (ret)
+				return ret;
+		}
+
+		start = base;
+		crash_memory_ranges[crash_mem_ranges].base = start;
+		crash_mem_ranges++;
+	}
+
+	crash_memory_ranges[crash_mem_ranges - 1].size = (end - start);
+	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		(crash_mem_ranges - 1), start, end - 1, (end - start));
+	return 0;
+}
+
+static int fadump_exclude_reserved_area(unsigned long long start,
+					unsigned long long end)
+{
+	unsigned long long ra_start, ra_end;
+	int ret = 0;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	if ((ra_start < end) && (ra_end > start)) {
+		if ((start < ra_start) && (end > ra_end)) {
+			ret = fadump_add_crash_memory(start, ra_start);
+			if (ret)
+				return ret;
+
+			ret = fadump_add_crash_memory(ra_end, end);
+		} else if (start < ra_start) {
+			ret = fadump_add_crash_memory(start, ra_start);
+		} else if (ra_end < end) {
+			ret = fadump_add_crash_memory(ra_end, end);
+		}
+	} else
+		ret = fadump_add_crash_memory(start, end);
+
+	return ret;
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+#if defined(_CALL_ELF)
+	elf->e_flags = _CALL_ELF;
+#else
+	elf->e_flags = 0;
+#endif
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	return 0;
+}
+
+/*
+ * Traverse through memblock structure and setup crash memory ranges. These
+ * ranges will be used create PT_LOAD program headers in elfcore header.
+ */
+static int fadump_setup_crash_memory_ranges(void)
+{
+	struct memblock_region *reg;
+	unsigned long long start, end;
+	int ret;
+
+	pr_debug("Setup crash memory ranges.\n");
+	crash_mem_ranges = 0;
+
+	/*
+	 * add the first memory chunk (RMA_START through boot_memory_size) as
+	 * a separate memory chunk. The reason is, at the time crash firmware
+	 * will move the content of this memory chunk to different location
+	 * specified during fadump registration. We need to create a separate
+	 * program header for this chunk with the correct offset.
+	 */
+	ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size);
+	if (ret)
+		return ret;
+
+	for_each_memblock(memory, reg) {
+		start = (unsigned long long)reg->base;
+		end = start + (unsigned long long)reg->size;
+
+		/*
+		 * skip the first memory chunk that is already added (RMA_START
+		 * through boot_memory_size). This logic needs a relook if and
+		 * when RMA_START changes to a non-zero value.
+		 */
+		BUILD_BUG_ON(RMA_START != 0);
+		if (start < fw_dump.boot_memory_size) {
+			if (end > fw_dump.boot_memory_size)
+				start = fw_dump.boot_memory_size;
+			else
+				continue;
+		}
+
+		/* add this range excluding the reserved dump area. */
+		ret = fadump_exclude_reserved_area(start, end);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * If the given physical address falls within the boot memory region then
+ * return the relocated address that points to the dump region reserved
+ * for saving initial boot memory contents.
+ */
+static inline unsigned long fadump_relocate(unsigned long paddr)
+{
+	if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
+		return be64_to_cpu(fdm.rmr_region.destination_address) + paddr;
+	else
+		return paddr;
+}
+
+static int fadump_create_elfcore_headers(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+	int i;
+
+	fadump_init_elfcore_header(bufp);
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/*
+	 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
+	 * will be populated during second kernel boot after crash. Hence
+	 * this PT_NOTE will always be the first elf note.
+	 *
+	 * NOTE: Any new ELF note addition should be placed after this note.
+	 */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type = PT_NOTE;
+	phdr->p_flags = 0;
+	phdr->p_vaddr = 0;
+	phdr->p_align = 0;
+
+	phdr->p_offset = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = 0;
+	phdr->p_memsz = 0;
+
+	(elf->e_phnum)++;
+
+	/* setup ELF PT_NOTE for vmcoreinfo */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type	= PT_NOTE;
+	phdr->p_flags	= 0;
+	phdr->p_vaddr	= 0;
+	phdr->p_align	= 0;
+
+	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
+	phdr->p_offset	= phdr->p_paddr;
+	phdr->p_memsz	= phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
+
+	/* Increment number of program headers. */
+	(elf->e_phnum)++;
+
+	/* setup PT_LOAD sections. */
+
+	for (i = 0; i < crash_mem_ranges; i++) {
+		unsigned long long mbase, msize;
+		mbase = crash_memory_ranges[i].base;
+		msize = crash_memory_ranges[i].size;
+
+		if (!msize)
+			continue;
+
+		phdr = (struct elf_phdr *)bufp;
+		bufp += sizeof(struct elf_phdr);
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= mbase;
+
+		if (mbase == RMA_START) {
+			/*
+			 * The entire RMA region will be moved by firmware
+			 * to the specified destination_address. Hence set
+			 * the correct offset.
+			 */
+			phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address);
+		}
+
+		phdr->p_paddr = mbase;
+		phdr->p_vaddr = (unsigned long)__va(mbase);
+		phdr->p_filesz = msize;
+		phdr->p_memsz = msize;
+		phdr->p_align = 0;
+
+		/* Increment number of program headers. */
+		(elf->e_phnum)++;
+	}
+	return 0;
+}
+
+static unsigned long init_fadump_header(unsigned long addr)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!addr)
+		return 0;
+
+	fw_dump.fadumphdr_addr = addr;
+	fdh = __va(addr);
+	addr += sizeof(struct fadump_crash_info_header);
+
+	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+	fdh->elfcorehdr_addr = addr;
+	/* We will set the crashing cpu id in crash_fadump() during crash. */
+	fdh->crashing_cpu = CPU_UNKNOWN;
+
+	return addr;
+}
+
+static int register_fadump(void)
+{
+	unsigned long addr;
+	void *vaddr;
+	int ret;
+
+	/*
+	 * If no memory is reserved then we can not register for firmware-
+	 * assisted dump.
+	 */
+	if (!fw_dump.reserve_dump_area_size)
+		return -ENODEV;
+
+	ret = fadump_setup_crash_memory_ranges();
+	if (ret)
+		return ret;
+
+	addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len);
+	/* Initialize fadump crash info header. */
+	addr = init_fadump_header(addr);
+	vaddr = __va(addr);
+
+	pr_debug("Creating ELF core headers at %#016lx\n", addr);
+	fadump_create_elfcore_headers(vaddr);
+
+	/* register the future kernel dump with firmware. */
+	return register_fw_dump(&fdm);
+}
+
+static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Un-register firmware-assisted dump\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_UNREGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to un-register firmware-assisted dump."
+			" unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_registered = 0;
+	return 0;
+}
+
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Invalidating firmware-assisted dump registration\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_INVALIDATE, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		pr_err("Failed to invalidate firmware-assisted dump registration. Unexpected error (%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+void fadump_cleanup(void)
+{
+	/* Invalidate the registration only if dump is active. */
+	if (fw_dump.dump_active) {
+		init_fadump_mem_struct(&fdm,
+			be64_to_cpu(fdm_active->cpu_state_data.destination_address));
+		fadump_invalidate_dump(&fdm);
+	} else if (fw_dump.dump_registered) {
+		/* Un-register Firmware-assisted dump if it was registered. */
+		fadump_unregister_dump(&fdm);
+		free_crash_memory_ranges();
+	}
+}
+
+static void fadump_free_reserved_memory(unsigned long start_pfn,
+					unsigned long end_pfn)
+{
+	unsigned long pfn;
+	unsigned long time_limit = jiffies + HZ;
+
+	pr_info("freeing reserved memory (0x%llx - 0x%llx)\n",
+		PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		free_reserved_page(pfn_to_page(pfn));
+
+		if (time_after(jiffies, time_limit)) {
+			cond_resched();
+			time_limit = jiffies + HZ;
+		}
+	}
+}
+
+/*
+ * Skip memory holes and free memory that was actually reserved.
+ */
+static void fadump_release_reserved_area(unsigned long start, unsigned long end)
+{
+	struct memblock_region *reg;
+	unsigned long tstart, tend;
+	unsigned long start_pfn = PHYS_PFN(start);
+	unsigned long end_pfn = PHYS_PFN(end);
+
+	for_each_memblock(memory, reg) {
+		tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+		tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+		if (tstart < tend) {
+			fadump_free_reserved_memory(tstart, tend);
+
+			if (tend == end_pfn)
+				break;
+
+			start_pfn = tend + 1;
+		}
+	}
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+	unsigned long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	/*
+	 * exclude the dump reserve area. Will reuse it for next
+	 * fadump registration.
+	 */
+	if (begin < ra_end && end > ra_start) {
+		if (begin < ra_start)
+			fadump_release_reserved_area(begin, ra_start);
+		if (end > ra_end)
+			fadump_release_reserved_area(ra_end, end);
+	} else
+		fadump_release_reserved_area(begin, end);
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+	unsigned long reserved_area_start, reserved_area_end;
+	unsigned long destination_address;
+
+	mutex_lock(&fadump_mutex);
+	if (!fw_dump.dump_active) {
+		mutex_unlock(&fadump_mutex);
+		return;
+	}
+
+	destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+	fadump_cleanup();
+	mutex_unlock(&fadump_mutex);
+
+	/*
+	 * Save the current reserved memory bounds we will require them
+	 * later for releasing the memory for general use.
+	 */
+	reserved_area_start = fw_dump.reserve_dump_area_start;
+	reserved_area_end = reserved_area_start +
+			fw_dump.reserve_dump_area_size;
+	/*
+	 * Setup reserve_dump_area_start and its size so that we can
+	 * reuse this reserved memory for Re-registration.
+	 */
+	fw_dump.reserve_dump_area_start = destination_address;
+	fw_dump.reserve_dump_area_size = get_fadump_area_size();
+
+	fadump_release_memory(reserved_area_start, reserved_area_end);
+	if (fw_dump.cpu_notes_buf) {
+		fadump_cpu_notes_buf_free(
+				(unsigned long)__va(fw_dump.cpu_notes_buf),
+				fw_dump.cpu_notes_buf_size);
+		fw_dump.cpu_notes_buf = 0;
+		fw_dump.cpu_notes_buf_size = 0;
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+}
+
+static ssize_t fadump_release_memory_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	int input = -1;
+
+	if (!fw_dump.dump_active)
+		return -EPERM;
+
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
+	if (input == 1) {
+		/*
+		 * Take away the '/proc/vmcore'. We are releasing the dump
+		 * memory, hence it will not be valid anymore.
+		 */
+#ifdef CONFIG_PROC_VMCORE
+		vmcore_cleanup();
+#endif
+		fadump_invalidate_release_mem();
+
+	} else
+		return -EINVAL;
+	return count;
+}
+
+static ssize_t fadump_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
+}
+
+static ssize_t fadump_register_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.dump_registered);
+}
+
+static ssize_t fadump_register_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	int ret = 0;
+	int input = -1;
+
+	if (!fw_dump.fadump_enabled || fdm_active)
+		return -EPERM;
+
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
+	mutex_lock(&fadump_mutex);
+
+	switch (input) {
+	case 0:
+		if (fw_dump.dump_registered == 0) {
+			goto unlock_out;
+		}
+		/* Un-register Firmware-assisted dump */
+		fadump_unregister_dump(&fdm);
+		break;
+	case 1:
+		if (fw_dump.dump_registered == 1) {
+			ret = -EEXIST;
+			goto unlock_out;
+		}
+		/* Register Firmware-assisted dump */
+		ret = register_fadump();
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+unlock_out:
+	mutex_unlock(&fadump_mutex);
+	return ret < 0 ? ret : count;
+}
+
+static int fadump_region_show(struct seq_file *m, void *private)
+{
+	const struct fadump_mem_struct *fdm_ptr;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	mutex_lock(&fadump_mutex);
+	if (fdm_active)
+		fdm_ptr = fdm_active;
+	else {
+		mutex_unlock(&fadump_mutex);
+		fdm_ptr = &fdm;
+	}
+
+	seq_printf(m,
+			"CPU : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address),
+			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) +
+			be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1,
+			be64_to_cpu(fdm_ptr->cpu_state_data.source_len),
+			be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped));
+	seq_printf(m,
+			"HPTE: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			be64_to_cpu(fdm_ptr->hpte_region.destination_address),
+			be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
+			be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
+			be64_to_cpu(fdm_ptr->hpte_region.source_len),
+			be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
+	seq_printf(m,
+			"DUMP: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			be64_to_cpu(fdm_ptr->rmr_region.destination_address),
+			be64_to_cpu(fdm_ptr->rmr_region.destination_address) +
+			be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1,
+			be64_to_cpu(fdm_ptr->rmr_region.source_len),
+			be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
+
+	if (!fdm_active ||
+		(fw_dump.reserve_dump_area_start ==
+		be64_to_cpu(fdm_ptr->cpu_state_data.destination_address)))
+		goto out;
+
+	/* Dump is active. Show reserved memory region. */
+	seq_printf(m,
+			"    : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			(unsigned long long)fw_dump.reserve_dump_area_start,
+			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1,
+			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
+			fw_dump.reserve_dump_area_start,
+			be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
+			fw_dump.reserve_dump_area_start);
+out:
+	if (fdm_active)
+		mutex_unlock(&fadump_mutex);
+	return 0;
+}
+
+static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem,
+						0200, NULL,
+						fadump_release_memory_store);
+static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
+						0444, fadump_enabled_show,
+						NULL);
+static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered,
+						0644, fadump_register_show,
+						fadump_register_store);
+
+static int fadump_region_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fadump_region_show, inode->i_private);
+}
+
+static const struct file_operations fadump_region_fops = {
+	.open    = fadump_region_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+static void fadump_init_files(void)
+{
+	struct dentry *debugfs_file;
+	int rc = 0;
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_enabled (%d)\n", rc);
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_registered (%d)\n", rc);
+
+	debugfs_file = debugfs_create_file("fadump_region", 0444,
+					powerpc_debugfs_root, NULL,
+					&fadump_region_fops);
+	if (!debugfs_file)
+		printk(KERN_ERR "fadump: unable to create debugfs file"
+				" fadump_region\n");
+
+	if (fw_dump.dump_active) {
+		rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr);
+		if (rc)
+			printk(KERN_ERR "fadump: unable to create sysfs file"
+				" fadump_release_mem (%d)\n", rc);
+	}
+	return;
+}
+
+/*
+ * Prepare for firmware-assisted dump.
+ */
+int __init setup_fadump(void)
+{
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_ERR "Firmware-assisted dump is not supported on"
+			" this hardware\n");
+		return 0;
+	}
+
+	fadump_show_config();
+	/*
+	 * If dump data is available then see if it is valid and prepare for
+	 * saving it to the disk.
+	 */
+	if (fw_dump.dump_active) {
+		/*
+		 * if dump process fails then invalidate the registration
+		 * and release memory before proceeding for re-registration.
+		 */
+		if (process_fadump(fdm_active) < 0)
+			fadump_invalidate_release_mem();
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	else if (fw_dump.reserve_dump_area_size)
+		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+	fadump_init_files();
+
+	return 1;
+}
+subsys_initcall(setup_fadump);
diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
new file mode 100644
index 000000000..2eae4478f
--- /dev/null
+++ b/arch/powerpc/kernel/firmware.c
@@ -0,0 +1,22 @@
+/*
+ *  Extracted from cputable.c
+ *
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *  Copyright (C) 2005 Stephen Rothwell, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/cache.h>
+
+#include <asm/firmware.h>
+
+unsigned long powerpc_firmware_features __read_mostly;
+EXPORT_SYMBOL_GPL(powerpc_firmware_features);
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
new file mode 100644
index 000000000..529dcc21c
--- /dev/null
+++ b/arch/powerpc/kernel/fpu.S
@@ -0,0 +1,154 @@
+/*
+ *  FPU support code, moved here from head.S so that it can be used
+ *  by chips which use other head-whatever.S files.
+ *
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *    Copyright (C) 1996 Paul Mackerras.
+ *    Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#ifdef CONFIG_VSX
+#define __REST_32FPVSRS(n,c,base)					\
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	REST_32FPRS(n,base);						\
+	b	3f;							\
+2:	REST_32VSRS(n,c,base);						\
+3:
+
+#define __SAVE_32FPVSRS(n,c,base)					\
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	SAVE_32FPRS(n,base);						\
+	b	3f;							\
+2:	SAVE_32VSRS(n,c,base);						\
+3:
+#else
+#define __REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
+#define __SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
+#endif
+#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
+#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+
+/*
+ * Load state from memory into FP registers including FPSCR.
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(load_fp_state)
+	lfd	fr0,FPSTATE_FPSCR(r3)
+	MTFSF_L(fr0)
+	REST_32FPVSRS(0, R4, R3)
+	blr
+EXPORT_SYMBOL(load_fp_state)
+
+/*
+ * Store FP state into memory, including FPSCR
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(store_fp_state)
+	SAVE_32FPVSRS(0, R4, R3)
+	mffs	fr0
+	stfd	fr0,FPSTATE_FPSCR(r3)
+	blr
+EXPORT_SYMBOL(store_fp_state)
+
+/*
+ * This task wants to use the FPU now.
+ * On UP, disable FP for the task which had the FPU previously,
+ * and save its floating-point registers in its thread_struct.
+ * Load up this task's FP registers from its thread_struct,
+ * enable the FPU for the current task and return to the task.
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
+ */
+_GLOBAL(load_up_fpu)
+	mfmsr	r5
+	ori	r5,r5,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r5,r5,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	SYNC
+	MTMSRD(r5)			/* enable use of fpu now */
+	isync
+	/* enable use of FP after return */
+#ifdef CONFIG_PPC32
+	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
+	lwz	r4,THREAD_FPEXC_MODE(r5)
+	ori	r9,r9,MSR_FP		/* enable FP for current */
+	or	r9,r9,r4
+#else
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	lwz	r4,THREAD_FPEXC_MODE(r5)
+	ori	r12,r12,MSR_FP
+	or	r12,r12,r4
+	std	r12,_MSR(r1)
+#endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_FP(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_FP(r5)
+	addi	r10,r5,THREAD_FPSTATE
+	lfd	fr0,FPSTATE_FPSCR(r10)
+	MTFSF_L(fr0)
+	REST_32FPVSRS(0, R4, R10)
+	/* restore registers and return */
+	/* we haven't used ctr or xer or lr */
+	blr
+
+/*
+ * save_fpu(tsk)
+ * Save the floating-point registers in its thread_struct.
+ * Enables the FPU for use in the kernel on return.
+ */
+_GLOBAL(save_fpu)
+	addi	r3,r3,THREAD	        /* want THREAD of task */
+	PPC_LL	r6,THREAD_FPSAVEAREA(r3)
+	PPC_LL	r5,PT_REGS(r3)
+	PPC_LCMPI	0,r6,0
+	bne	2f
+	addi	r6,r3,THREAD_FPSTATE
+2:	SAVE_32FPVSRS(0, R4, R6)
+	mffs	fr0
+	stfd	fr0,FPSTATE_FPSCR(r6)
+	blr
+
+/*
+ * These are used in the alignment trap handler when emulating
+ * single-precision loads and stores.
+ */
+
+_GLOBAL(cvt_fd)
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	blr
+
+_GLOBAL(cvt_df)
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	blr
diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
new file mode 100644
index 000000000..ea065282b
--- /dev/null
+++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* 1. Find the index of the entry we're executing in */
+	bl	invstr				/* Find our address */
+invstr:	mflr	r6				/* Make it accessible */
+	mfmsr	r7
+	rlwinm	r4,r7,27,31,31			/* extract MSR[IS] */
+	mfspr	r7, SPRN_PID0
+	slwi	r7,r7,16
+	or	r7,r7,r4
+	mtspr	SPRN_MAS6,r7
+	tlbsx	0,r6				/* search MSR[IS], SPID=PID0 */
+	mfspr	r7,SPRN_MAS1
+	andis.	r7,r7,MAS1_VALID@h
+	bne	match_TLB
+
+	mfspr	r7,SPRN_MMUCFG
+	rlwinm	r7,r7,21,28,31			/* extract MMUCFG[NPIDS] */
+	cmpwi	r7,3
+	bne	match_TLB			/* skip if NPIDS != 3 */
+
+	mfspr	r7,SPRN_PID1
+	slwi	r7,r7,16
+	or	r7,r7,r4
+	mtspr	SPRN_MAS6,r7
+	tlbsx	0,r6				/* search MSR[IS], SPID=PID1 */
+	mfspr	r7,SPRN_MAS1
+	andis.	r7,r7,MAS1_VALID@h
+	bne	match_TLB
+	mfspr	r7, SPRN_PID2
+	slwi	r7,r7,16
+	or	r7,r7,r4
+	mtspr	SPRN_MAS6,r7
+	tlbsx	0,r6				/* Fall through, we had to match */
+
+match_TLB:
+	mfspr	r7,SPRN_MAS0
+	rlwinm	r3,r7,16,20,31			/* Extract MAS0(Entry) */
+
+	mfspr	r7,SPRN_MAS1			/* Insure IPROT set */
+	oris	r7,r7,MAS1_IPROT@h
+	mtspr	SPRN_MAS1,r7
+	tlbwe
+
+/* 2. Invalidate all entries except the entry we're executing in */
+	mfspr	r9,SPRN_TLB1CFG
+	andi.	r9,r9,0xfff
+	li	r6,0				/* Set Entry counter to 0 */
+1:	lis	r7,0x1000			/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r7,r6,16,4,15			/* Setup MAS0 = TLBSEL | ESEL(r6) */
+	mtspr	SPRN_MAS0,r7
+	tlbre
+	mfspr	r7,SPRN_MAS1
+	rlwinm	r7,r7,0,2,31			/* Clear MAS1 Valid and IPROT */
+	cmpw	r3,r6
+	beq	skpinv				/* Dont update the current execution TLB */
+	mtspr	SPRN_MAS1,r7
+	tlbwe
+	isync
+skpinv:	addi	r6,r6,1				/* Increment */
+	cmpw	r6,r9				/* Are we done? */
+	bne	1b				/* If not, repeat */
+
+	/* Invalidate TLB0 */
+	li	r6,0x04
+	tlbivax 0,r6
+	TLBSYNC
+	/* Invalidate TLB1 */
+	li	r6,0x0c
+	tlbivax 0,r6
+	TLBSYNC
+
+/* 3. Setup a temp mapping and jump to it */
+	andi.	r5, r3, 0x1	/* Find an entry not used and is non-zero */
+	addi	r5, r5, 0x1
+	lis	r7,0x1000	/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r7,r3,16,4,15	/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r7
+	tlbre
+
+	/* grab and fixup the RPN */
+	mfspr	r6,SPRN_MAS1	/* extract MAS1[SIZE] */
+	rlwinm	r6,r6,25,27,31
+	li	r8,-1
+	addi	r6,r6,10
+	slw	r6,r8,r6	/* convert to mask */
+
+	bl	1f		/* Find our address */
+1:	mflr	r7
+
+	mfspr	r8,SPRN_MAS3
+#ifdef CONFIG_PHYS_64BIT
+	mfspr	r23,SPRN_MAS7
+#endif
+	and	r8,r6,r8
+	subfic	r9,r6,-4096
+	and	r9,r9,r7
+
+	or	r25,r8,r9
+	ori	r8,r25,(MAS3_SX|MAS3_SW|MAS3_SR)
+
+	/* Just modify the entry ID and EPN for the temp mapping */
+	lis	r7,0x1000	/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r7,r5,16,4,15	/* Setup MAS0 = TLBSEL | ESEL(r5) */
+	mtspr	SPRN_MAS0,r7
+	xori	r6,r4,1		/* Setup TMP mapping in the other Address space */
+	slwi	r6,r6,12
+	oris	r6,r6,(MAS1_VALID|MAS1_IPROT)@h
+	ori	r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_4K))@l
+	mtspr	SPRN_MAS1,r6
+	mfspr	r6,SPRN_MAS2
+	li	r7,0		/* temp EPN = 0 */
+	rlwimi	r7,r6,0,20,31
+	mtspr	SPRN_MAS2,r7
+	mtspr	SPRN_MAS3,r8
+	tlbwe
+
+	xori	r6,r4,1
+	slwi	r6,r6,5		/* setup new context with other address space */
+	bl	1f		/* Find our address */
+1:	mflr	r9
+	rlwimi	r7,r9,0,20,31
+	addi	r7,r7,(2f - 1b)
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r6
+	rfi
+2:
+/* 4. Clear out PIDs & Search info */
+	li	r6,0
+	mtspr   SPRN_MAS6,r6
+	mtspr	SPRN_PID0,r6
+
+	mfspr	r7,SPRN_MMUCFG
+	rlwinm	r7,r7,21,28,31			/* extract MMUCFG[NPIDS] */
+	cmpwi	r7,3
+	bne	2f				/* skip if NPIDS != 3 */
+
+	mtspr	SPRN_PID1,r6
+	mtspr	SPRN_PID2,r6
+
+/* 5. Invalidate mapping we started in */
+2:
+	lis	r7,0x1000	/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r7,r3,16,4,15	/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r7
+	tlbre
+	mfspr	r6,SPRN_MAS1
+	rlwinm	r6,r6,0,2,0	/* clear IPROT */
+	mtspr	SPRN_MAS1,r6
+	tlbwe
+	/* Invalidate TLB1 */
+	li	r9,0x0c
+	tlbivax 0,r9
+	TLBSYNC
+
+/*
+ * The mapping only needs to be cache-coherent on SMP, except on
+ * Freescale e500mc derivatives where it's also needed for coherent DMA.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC)
+#define M_IF_NEEDED	MAS2_M
+#else
+#define M_IF_NEEDED	0
+#endif
+
+#if defined(ENTRY_MAPPING_BOOT_SETUP)
+
+/* 6. Setup KERNELBASE mapping in TLB1[0] */
+	lis	r6,0x1000		/* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */
+	mtspr	SPRN_MAS0,r6
+	lis	r6,(MAS1_VALID|MAS1_IPROT)@h
+	ori	r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l
+	mtspr	SPRN_MAS1,r6
+	lis	r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_NEEDED)@h
+	ori	r6,r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_NEEDED)@l
+	mtspr	SPRN_MAS2,r6
+	mtspr	SPRN_MAS3,r8
+	tlbwe
+
+/* 7. Jump to KERNELBASE mapping */
+	lis	r6,(KERNELBASE & ~0xfff)@h
+	ori	r6,r6,(KERNELBASE & ~0xfff)@l
+	rlwinm	r7,r25,0,0x03ffffff
+	add	r6,r7,r6
+
+#elif defined(ENTRY_MAPPING_KEXEC_SETUP)
+/*
+ * 6. Setup a 1:1 mapping in TLB1. Esel 0 is unsued, 1 or 2 contains the tmp
+ * mapping so we start at 3. We setup 8 mappings, each 256MiB in size. This
+ * will cover the first 2GiB of memory.
+ */
+
+	lis r10, (MAS1_VALID|MAS1_IPROT)@h
+	ori r10,r10, (MAS1_TSIZE(BOOK3E_PAGESZ_256M))@l
+	li  r11, 0
+	li  r0, 8
+	mtctr   r0
+
+next_tlb_setup:
+	addi	r0, r11, 3
+	rlwinm  r0, r0, 16, 4, 15  // Compute esel
+	rlwinm  r9, r11, 28, 0, 3   // Compute [ER]PN
+	oris    r0, r0, (MAS0_TLBSEL(1))@h
+	mtspr   SPRN_MAS0,r0
+	mtspr   SPRN_MAS1,r10
+	mtspr   SPRN_MAS2,r9
+	ori r9, r9, (MAS3_SX|MAS3_SW|MAS3_SR)
+	mtspr   SPRN_MAS3,r9
+	tlbwe
+	addi    r11, r11, 1
+	bdnz+   next_tlb_setup
+
+/* 7. Jump to our 1:1 mapping */
+	mr	r6, r25
+#else
+	#error You need to specify the mapping or not use this at all.
+#endif
+
+	lis	r7,MSR_KERNEL@h
+	ori	r7,r7,MSR_KERNEL@l
+	bl	1f			/* Find our address */
+1:	mflr	r9
+	rlwimi	r6,r9,0,20,31
+	addi	r6,r6,(2f - 1b)
+	mtspr	SPRN_SRR0,r6
+	mtspr	SPRN_SRR1,r7
+	rfi				/* start execution out of TLB1[0] entry */
+
+/* 8. Clear out the temp mapping */
+2:	lis	r7,0x1000	/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r7,r5,16,4,15	/* Setup MAS0 = TLBSEL | ESEL(r5) */
+	mtspr	SPRN_MAS0,r7
+	tlbre
+	mfspr	r8,SPRN_MAS1
+	rlwinm	r8,r8,0,2,0	/* clear IPROT */
+	mtspr	SPRN_MAS1,r8
+	tlbwe
+	/* Invalidate TLB1 */
+	li	r9,0x0c
+	tlbivax 0,r9
+	TLBSYNC
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
new file mode 100644
index 000000000..61ca27929
--- /dev/null
+++ b/arch/powerpc/kernel/head_32.S
@@ -0,0 +1,1282 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *  MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This file contains the low-level support and setup for the
+ *  PowerPC platform, including trap and interrupt dispatch.
+ *  (The PPC 8xx embedded CPUs use head_8xx.S instead.)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/bug.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+
+/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
+#define LOAD_BAT(n, reg, RA, RB)	\
+	/* see the comment for clear_bats() -- Cort */ \
+	li	RA,0;			\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_DBAT##n##U,RA;	\
+	lwz	RA,(n*16)+0(reg);	\
+	lwz	RB,(n*16)+4(reg);	\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_IBAT##n##L,RB;	\
+	beq	1f;			\
+	lwz	RA,(n*16)+8(reg);	\
+	lwz	RB,(n*16)+12(reg);	\
+	mtspr	SPRN_DBAT##n##U,RA;	\
+	mtspr	SPRN_DBAT##n##L,RB;	\
+1:
+
+	__HEAD
+	.stabs	"arch/powerpc/kernel/",N_SO,0,0,0f
+	.stabs	"head_32.S",N_SO,0,0,0f
+0:
+_ENTRY(_stext);
+
+/*
+ * _start is defined this way because the XCOFF loader in the OpenFirmware
+ * on the powermac expects the entry point to be a procedure descriptor.
+ */
+_ENTRY(_start);
+	/*
+	 * These are here for legacy reasons, the kernel used to
+	 * need to look like a coff function entry for the pmac
+	 * but we're always started by some kind of bootloader now.
+	 *  -- Cort
+	 */
+	nop	/* used by __secondary_hold on prep (mtx) and chrp smp */
+	nop	/* used by __secondary_hold on prep (mtx) and chrp smp */
+	nop
+
+/* PMAC
+ * Enter here with the kernel text, data and bss loaded starting at
+ * 0, running with virtual == physical mapping.
+ * r5 points to the prom entry point (the client interface handler
+ * address).  Address translation is turned on, with the prom
+ * managing the hash table.  Interrupts are disabled.  The stack
+ * pointer (r1) points to just below the end of the half-meg region
+ * from 0x380000 - 0x400000, which is mapped in already.
+ *
+ * If we are booted from MacOS via BootX, we enter with the kernel
+ * image loaded somewhere, and the following values in registers:
+ *  r3: 'BooX' (0x426f6f58)
+ *  r4: virtual address of boot_infos_t
+ *  r5: 0
+ *
+ * PREP
+ * This is jumped to on prep systems right after the kernel is relocated
+ * to its proper place in memory by the boot loader.  The expected layout
+ * of the regs is:
+ *   r3: ptr to residual data
+ *   r4: initrd_start or if no initrd then 0
+ *   r5: initrd_end - unused if r4 is 0
+ *   r6: Start of command line string
+ *   r7: End of command line string
+ *
+ * This just gets a minimal mmu environment setup so we can call
+ * start_here() to do the real work.
+ * -- Cort
+ */
+
+	.globl	__start
+__start:
+/*
+ * We have to do any OF calls before we map ourselves to KERNELBASE,
+ * because OF may have I/O devices mapped into that area
+ * (particularly on CHRP).
+ */
+	cmpwi	0,r5,0
+	beq	1f
+
+#ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
+	/* find out where we are now */
+	bcl	20,31,$+4
+0:	mflr	r8			/* r8 = runtime addr here */
+	addis	r8,r8,(_stext - 0b)@ha
+	addi	r8,r8,(_stext - 0b)@l	/* current runtime base addr */
+	bl	prom_init
+#endif /* CONFIG_PPC_OF_BOOT_TRAMPOLINE */
+
+	/* We never return. We also hit that trap if trying to boot
+	 * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */
+	trap
+
+/*
+ * Check for BootX signature when supporting PowerMac and branch to
+ * appropriate trampoline if it's present
+ */
+#ifdef CONFIG_PPC_PMAC
+1:	lis	r31,0x426f
+	ori	r31,r31,0x6f58
+	cmpw	0,r3,r31
+	bne	1f
+	bl	bootx_init
+	trap
+#endif /* CONFIG_PPC_PMAC */
+
+1:	mr	r31,r3			/* save device tree ptr */
+	li	r24,0			/* cpu # */
+
+/*
+ * early_init() does the early machine identification and does
+ * the necessary low-level setup and clears the BSS
+ *  -- Cort <cort@fsmlabs.com>
+ */
+	bl	early_init
+
+/* Switch MMU off, clear BATs and flush TLB. At this point, r3 contains
+ * the physical address we are running at, returned by early_init()
+ */
+ 	bl	mmu_off
+__after_mmu_off:
+	bl	clear_bats
+	bl	flush_tlbs
+
+	bl	initial_bats
+#if defined(CONFIG_BOOTX_TEXT)
+	bl	setup_disp_bat
+#endif
+#ifdef CONFIG_PPC_EARLY_DEBUG_CPM
+	bl	setup_cpm_bat
+#endif
+#ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO
+	bl	setup_usbgecko_bat
+#endif
+
+/*
+ * Call setup_cpu for CPU 0 and initialize 6xx Idle
+ */
+	bl	reloc_offset
+	li	r24,0			/* cpu# */
+	bl	call_setup_cpu		/* Call setup_cpu for this CPU */
+#ifdef CONFIG_6xx
+	bl	reloc_offset
+	bl	init_idle_6xx
+#endif /* CONFIG_6xx */
+
+
+/*
+ * We need to run with _start at physical address 0.
+ * On CHRP, we are loaded at 0x10000 since OF on CHRP uses
+ * the exception vectors at 0 (and therefore this copy
+ * overwrites OF's exception vectors with our own).
+ * The MMU is off at this point.
+ */
+	bl	reloc_offset
+	mr	r26,r3
+	addis	r4,r3,KERNELBASE@h	/* current address of _start */
+	lis	r5,PHYSICAL_START@h
+	cmplw	0,r4,r5			/* already running at PHYSICAL_START? */
+	bne	relocate_kernel
+/*
+ * we now have the 1st 16M of ram mapped with the bats.
+ * prep needs the mmu to be turned on here, but pmac already has it on.
+ * this shouldn't bother the pmac since it just gets turned on again
+ * as we jump to our code at KERNELBASE. -- Cort
+ * Actually no, pmac doesn't have it on any more. BootX enters with MMU
+ * off, and in other cases, we now turn it off before changing BATs above.
+ */
+turn_on_mmu:
+	mfmsr	r0
+	ori	r0,r0,MSR_DR|MSR_IR
+	mtspr	SPRN_SRR1,r0
+	lis	r0,start_here@h
+	ori	r0,r0,start_here@l
+	mtspr	SPRN_SRR0,r0
+	SYNC
+	RFI				/* enables MMU */
+
+/*
+ * We need __secondary_hold as a place to hold the other cpus on
+ * an SMP machine, even when we are running a UP kernel.
+ */
+	. = 0xc0			/* for prep bootloader */
+	li	r3,1			/* MTX only has 1 cpu */
+	.globl	__secondary_hold
+__secondary_hold:
+	/* tell the master we're here */
+	stw	r3,__secondary_hold_acknowledge@l(0)
+#ifdef CONFIG_SMP
+100:	lwz	r4,0(0)
+	/* wait until we're told to start */
+	cmpw	0,r4,r3
+	bne	100b
+	/* our cpu # was at addr 0 - go */
+	mr	r24,r3			/* cpu # */
+	b	__secondary_start
+#else
+	b	.
+#endif /* CONFIG_SMP */
+
+	.globl	__secondary_hold_spinloop
+__secondary_hold_spinloop:
+	.long	0
+	.globl	__secondary_hold_acknowledge
+__secondary_hold_acknowledge:
+	.long	-1
+
+/*
+ * Exception entry code.  This code runs with address translation
+ * turned off, i.e. using physical addresses.
+ * We assume sprg3 has the physical address of the current
+ * task's thread_struct.
+ */
+#define EXCEPTION_PROLOG	\
+	mtspr	SPRN_SPRG_SCRATCH0,r10;	\
+	mtspr	SPRN_SPRG_SCRATCH1,r11;	\
+	mfcr	r10;		\
+	EXCEPTION_PROLOG_1;	\
+	EXCEPTION_PROLOG_2
+
+#define EXCEPTION_PROLOG_1	\
+	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel */ \
+	andi.	r11,r11,MSR_PR;	\
+	tophys(r11,r1);			/* use tophys(r1) if kernel */ \
+	beq	1f;		\
+	mfspr	r11,SPRN_SPRG_THREAD;	\
+	lwz	r11,THREAD_INFO-THREAD(r11);	\
+	addi	r11,r11,THREAD_SIZE;	\
+	tophys(r11,r11);	\
+1:	subi	r11,r11,INT_FRAME_SIZE	/* alloc exc. frame */
+
+
+#define EXCEPTION_PROLOG_2	\
+	stw	r10,_CCR(r11);		/* save registers */ \
+	stw	r12,GPR12(r11);	\
+	stw	r9,GPR9(r11);	\
+	mfspr	r10,SPRN_SPRG_SCRATCH0;	\
+	stw	r10,GPR10(r11);	\
+	mfspr	r12,SPRN_SPRG_SCRATCH1;	\
+	stw	r12,GPR11(r11);	\
+	mflr	r10;		\
+	stw	r10,_LINK(r11);	\
+	mfspr	r12,SPRN_SRR0;	\
+	mfspr	r9,SPRN_SRR1;	\
+	stw	r1,GPR1(r11);	\
+	stw	r1,0(r11);	\
+	tovirt(r1,r11);			/* set new kernel sp */	\
+	li	r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \
+	MTMSRD(r10);			/* (except for mach check in rtas) */ \
+	stw	r0,GPR0(r11);	\
+	lis	r10,STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \
+	addi	r10,r10,STACK_FRAME_REGS_MARKER@l; \
+	stw	r10,8(r11);	\
+	SAVE_4GPRS(3, r11);	\
+	SAVE_2GPRS(7, r11)
+
+/*
+ * Note: code which follows this uses cr0.eq (set if from kernel),
+ * r11, r12 (SRR0), and r9 (SRR1).
+ *
+ * Note2: once we have set r1 we are in a position to take exceptions
+ * again, and we could thus set MSR:RI at that point.
+ */
+
+/*
+ * Exception vectors.
+ */
+#define EXCEPTION(n, label, hdlr, xfer)		\
+	. = n;					\
+	DO_KVM n;				\
+label:						\
+	EXCEPTION_PROLOG;			\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;	\
+	xfer(n, hdlr)
+
+#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret)	\
+	li	r10,trap;					\
+	stw	r10,_TRAP(r11);					\
+	li	r10,MSR_KERNEL;					\
+	copyee(r10, r9);					\
+	bl	tfer;						\
+i##n:								\
+	.long	hdlr;						\
+	.long	ret
+
+#define COPY_EE(d, s)		rlwimi d,s,0,16,16
+#define NOCOPY(d, s)
+
+#define EXC_XFER_STD(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full,	\
+			  ret_from_except_full)
+
+#define EXC_XFER_LITE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \
+			  ret_from_except)
+
+#define EXC_XFER_EE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_EE_LITE(n, hdlr)	\
+	EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \
+			  ret_from_except)
+
+/* System reset */
+/* core99 pmac starts the seconary here by changing the vector, and
+   putting it back to what it was (unknown_exception) when done.  */
+	EXCEPTION(0x100, Reset, unknown_exception, EXC_XFER_STD)
+
+/* Machine check */
+/*
+ * On CHRP, this is complicated by the fact that we could get a
+ * machine check inside RTAS, and we have no guarantee that certain
+ * critical registers will have the values we expect.  The set of
+ * registers that might have bad values includes all the GPRs
+ * and all the BATs.  We indicate that we are in RTAS by putting
+ * a non-zero value, the address of the exception frame to use,
+ * in SPRG2.  The machine check handler checks SPRG2 and uses its
+ * value if it is non-zero.  If we ever needed to free up SPRG2,
+ * we could use a field in the thread_info or thread_struct instead.
+ * (Other exception handlers assume that r1 is a valid kernel stack
+ * pointer when we take an exception from supervisor mode.)
+ *	-- paulus.
+ */
+	. = 0x200
+	DO_KVM  0x200
+	mtspr	SPRN_SPRG_SCRATCH0,r10
+	mtspr	SPRN_SPRG_SCRATCH1,r11
+	mfcr	r10
+#ifdef CONFIG_PPC_CHRP
+	mfspr	r11,SPRN_SPRG_RTAS
+	cmpwi	0,r11,0
+	bne	7f
+#endif /* CONFIG_PPC_CHRP */
+	EXCEPTION_PROLOG_1
+7:	EXCEPTION_PROLOG_2
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_CHRP
+	mfspr	r4,SPRN_SPRG_RTAS
+	cmpwi	cr1,r4,0
+	bne	cr1,1f
+#endif
+	EXC_XFER_STD(0x200, machine_check_exception)
+#ifdef CONFIG_PPC_CHRP
+1:	b	machine_check_in_rtas
+#endif
+
+/* Data access exception. */
+	. = 0x300
+	DO_KVM  0x300
+DataAccess:
+	EXCEPTION_PROLOG
+	mfspr	r10,SPRN_DSISR
+	stw	r10,_DSISR(r11)
+	andis.	r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h
+	bne	1f			/* if not, try to put a PTE */
+	mfspr	r4,SPRN_DAR		/* into the hash table */
+	rlwinm	r3,r10,32-15,21,21	/* DSISR_STORE -> _PAGE_RW */
+	bl	hash_page
+1:	lwz	r5,_DSISR(r11)		/* get DSISR value */
+	mfspr	r4,SPRN_DAR
+	EXC_XFER_LITE(0x300, handle_page_fault)
+
+
+/* Instruction access exception. */
+	. = 0x400
+	DO_KVM  0x400
+InstructionAccess:
+	EXCEPTION_PROLOG
+	andis.	r0,r9,SRR1_ISI_NOPT@h	/* no pte found? */
+	beq	1f			/* if so, try to put a PTE */
+	li	r3,0			/* into the hash table */
+	mr	r4,r12			/* SRR0 is fault address */
+	bl	hash_page
+1:	mr	r4,r12
+	andis.	r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
+	EXC_XFER_LITE(0x400, handle_page_fault)
+
+/* External interrupt */
+	EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
+
+/* Alignment exception */
+	. = 0x600
+	DO_KVM  0x600
+Alignment:
+	EXCEPTION_PROLOG
+	mfspr	r4,SPRN_DAR
+	stw	r4,_DAR(r11)
+	mfspr	r5,SPRN_DSISR
+	stw	r5,_DSISR(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE(0x600, alignment_exception)
+
+/* Program check exception */
+	EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD)
+
+/* Floating-point unavailable */
+	. = 0x800
+	DO_KVM  0x800
+FPUnavailable:
+BEGIN_FTR_SECTION
+/*
+ * Certain Freescale cores don't have a FPU and treat fp instructions
+ * as a FP Unavailable exception.  Redirect to illegal/emulation handling.
+ */
+	b 	ProgramCheck
+END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE)
+	EXCEPTION_PROLOG
+	beq	1f
+	bl	load_up_fpu		/* if from user, just load it up */
+	b	fast_exception_return
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception)
+
+/* Decrementer */
+	EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE)
+
+	EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE)
+
+/* System call */
+	. = 0xc00
+	DO_KVM  0xc00
+SystemCall:
+	EXCEPTION_PROLOG
+	EXC_XFER_EE_LITE(0xc00, DoSyscall)
+
+/* Single step - not used on 601 */
+	EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD)
+	EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE)
+
+/*
+ * The Altivec unavailable trap is at 0x0f20.  Foo.
+ * We effectively remap it to 0x3000.
+ * We include an altivec unavailable exception vector even if
+ * not configured for Altivec, so that you can't panic a
+ * non-altivec kernel running on a machine with altivec just
+ * by executing an altivec instruction.
+ */
+	. = 0xf00
+	DO_KVM  0xf00
+	b	PerformanceMonitor
+
+	. = 0xf20
+	DO_KVM  0xf20
+	b	AltiVecUnavailable
+
+/*
+ * Handle TLB miss for instruction on 603/603e.
+ * Note: we get an alternate set of r0 - r3 to use automatically.
+ */
+	. = 0x1000
+InstructionTLBMiss:
+/*
+ * r0:	scratch
+ * r1:	linux style pte ( later becomes ppc hardware pte )
+ * r2:	ptr to linux-style pte
+ * r3:	scratch
+ */
+	/* Get PTE (linux-style) and check access */
+	mfspr	r3,SPRN_IMISS
+	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	cmplw	0,r1,r3
+	mfspr	r2,SPRN_SPRG_THREAD
+	li	r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */
+	lwz	r2,PGDIR(r2)
+	bge-	112f
+	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
+	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
+	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
+	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
+112:	tophys(r2,r2)
+	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lwz	r2,0(r2)		/* get pmd entry */
+	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
+	beq-	InstructionAddressInvalid	/* return if no mapping */
+	rlwimi	r2,r3,22,20,29		/* insert next 10 bits of address */
+	lwz	r0,0(r2)		/* get linux-style pte */
+	andc.	r1,r1,r0		/* check access & ~permission */
+	bne-	InstructionAddressInvalid /* return if access not permitted */
+	ori	r0,r0,_PAGE_ACCESSED	/* set _PAGE_ACCESSED in pte */
+	/*
+	 * NOTE! We are assuming this is not an SMP system, otherwise
+	 * we would need to update the pte atomically with lwarx/stwcx.
+	 */
+	stw	r0,0(r2)		/* update PTE (accessed bit) */
+	/* Convert linux-style PTE to low word of PPC-style PTE */
+	rlwinm	r1,r0,32-10,31,31	/* _PAGE_RW -> PP lsb */
+	rlwinm	r2,r0,32-7,31,31	/* _PAGE_DIRTY -> PP lsb */
+	and	r1,r1,r2		/* writable if _RW and _DIRTY */
+	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
+	rlwimi	r0,r0,32-1,31,31	/* _PAGE_USER -> PP lsb */
+	ori	r1,r1,0xe04		/* clear out reserved bits */
+	andc	r1,r0,r1		/* PP = user? (rw&dirty? 2: 3): 0 */
+BEGIN_FTR_SECTION
+	rlwinm	r1,r1,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
+	mtspr	SPRN_RPA,r1
+	tlbli	r3
+	mfspr	r3,SPRN_SRR1		/* Need to restore CR0 */
+	mtcrf	0x80,r3
+	rfi
+InstructionAddressInvalid:
+	mfspr	r3,SPRN_SRR1
+	rlwinm	r1,r3,9,6,6	/* Get load/store bit */
+
+	addis	r1,r1,0x2000
+	mtspr	SPRN_DSISR,r1	/* (shouldn't be needed) */
+	andi.	r2,r3,0xFFFF	/* Clear upper bits of SRR1 */
+	or	r2,r2,r1
+	mtspr	SPRN_SRR1,r2
+	mfspr	r1,SPRN_IMISS	/* Get failing address */
+	rlwinm.	r2,r2,0,31,31	/* Check for little endian access */
+	rlwimi	r2,r2,1,30,30	/* change 1 -> 3 */
+	xor	r1,r1,r2
+	mtspr	SPRN_DAR,r1	/* Set fault address */
+	mfmsr	r0		/* Restore "normal" registers */
+	xoris	r0,r0,MSR_TGPR>>16
+	mtcrf	0x80,r3		/* Restore CR0 */
+	mtmsr	r0
+	b	InstructionAccess
+
+/*
+ * Handle TLB miss for DATA Load operation on 603/603e
+ */
+	. = 0x1100
+DataLoadTLBMiss:
+/*
+ * r0:	scratch
+ * r1:	linux style pte ( later becomes ppc hardware pte )
+ * r2:	ptr to linux-style pte
+ * r3:	scratch
+ */
+	/* Get PTE (linux-style) and check access */
+	mfspr	r3,SPRN_DMISS
+	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	cmplw	0,r1,r3
+	mfspr	r2,SPRN_SPRG_THREAD
+	li	r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */
+	lwz	r2,PGDIR(r2)
+	bge-	112f
+	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
+	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
+	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
+	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
+112:	tophys(r2,r2)
+	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lwz	r2,0(r2)		/* get pmd entry */
+	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
+	beq-	DataAddressInvalid	/* return if no mapping */
+	rlwimi	r2,r3,22,20,29		/* insert next 10 bits of address */
+	lwz	r0,0(r2)		/* get linux-style pte */
+	andc.	r1,r1,r0		/* check access & ~permission */
+	bne-	DataAddressInvalid	/* return if access not permitted */
+	ori	r0,r0,_PAGE_ACCESSED	/* set _PAGE_ACCESSED in pte */
+	/*
+	 * NOTE! We are assuming this is not an SMP system, otherwise
+	 * we would need to update the pte atomically with lwarx/stwcx.
+	 */
+	stw	r0,0(r2)		/* update PTE (accessed bit) */
+	/* Convert linux-style PTE to low word of PPC-style PTE */
+	rlwinm	r1,r0,32-10,31,31	/* _PAGE_RW -> PP lsb */
+	rlwinm	r2,r0,32-7,31,31	/* _PAGE_DIRTY -> PP lsb */
+	and	r1,r1,r2		/* writable if _RW and _DIRTY */
+	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
+	rlwimi	r0,r0,32-1,31,31	/* _PAGE_USER -> PP lsb */
+	ori	r1,r1,0xe04		/* clear out reserved bits */
+	andc	r1,r0,r1		/* PP = user? (rw&dirty? 2: 3): 0 */
+BEGIN_FTR_SECTION
+	rlwinm	r1,r1,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
+	mtspr	SPRN_RPA,r1
+	mfspr	r2,SPRN_SRR1		/* Need to restore CR0 */
+	mtcrf	0x80,r2
+BEGIN_MMU_FTR_SECTION
+	li	r0,1
+	mfspr	r1,SPRN_SPRG_603_LRU
+	rlwinm	r2,r3,20,27,31		/* Get Address bits 15:19 */
+	slw	r0,r0,r2
+	xor	r1,r0,r1
+	srw	r0,r1,r2
+	mtspr   SPRN_SPRG_603_LRU,r1
+	mfspr	r2,SPRN_SRR1
+	rlwimi	r2,r0,31-14,14,14
+	mtspr   SPRN_SRR1,r2
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
+	tlbld	r3
+	rfi
+DataAddressInvalid:
+	mfspr	r3,SPRN_SRR1
+	rlwinm	r1,r3,9,6,6	/* Get load/store bit */
+	addis	r1,r1,0x2000
+	mtspr	SPRN_DSISR,r1
+	andi.	r2,r3,0xFFFF	/* Clear upper bits of SRR1 */
+	mtspr	SPRN_SRR1,r2
+	mfspr	r1,SPRN_DMISS	/* Get failing address */
+	rlwinm.	r2,r2,0,31,31	/* Check for little endian access */
+	beq	20f		/* Jump if big endian */
+	xori	r1,r1,3
+20:	mtspr	SPRN_DAR,r1	/* Set fault address */
+	mfmsr	r0		/* Restore "normal" registers */
+	xoris	r0,r0,MSR_TGPR>>16
+	mtcrf	0x80,r3		/* Restore CR0 */
+	mtmsr	r0
+	b	DataAccess
+
+/*
+ * Handle TLB miss for DATA Store on 603/603e
+ */
+	. = 0x1200
+DataStoreTLBMiss:
+/*
+ * r0:	scratch
+ * r1:	linux style pte ( later becomes ppc hardware pte )
+ * r2:	ptr to linux-style pte
+ * r3:	scratch
+ */
+	/* Get PTE (linux-style) and check access */
+	mfspr	r3,SPRN_DMISS
+	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
+	cmplw	0,r1,r3
+	mfspr	r2,SPRN_SPRG_THREAD
+	li	r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */
+	lwz	r2,PGDIR(r2)
+	bge-	112f
+	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
+	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
+	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
+	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
+112:	tophys(r2,r2)
+	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lwz	r2,0(r2)		/* get pmd entry */
+	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
+	beq-	DataAddressInvalid	/* return if no mapping */
+	rlwimi	r2,r3,22,20,29		/* insert next 10 bits of address */
+	lwz	r0,0(r2)		/* get linux-style pte */
+	andc.	r1,r1,r0		/* check access & ~permission */
+	bne-	DataAddressInvalid	/* return if access not permitted */
+	ori	r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY
+	/*
+	 * NOTE! We are assuming this is not an SMP system, otherwise
+	 * we would need to update the pte atomically with lwarx/stwcx.
+	 */
+	stw	r0,0(r2)		/* update PTE (accessed/dirty bits) */
+	/* Convert linux-style PTE to low word of PPC-style PTE */
+	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
+	li	r1,0xe05		/* clear out reserved bits & PP lsb */
+	andc	r1,r0,r1		/* PP = user? 2: 0 */
+BEGIN_FTR_SECTION
+	rlwinm	r1,r1,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
+END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
+	mtspr	SPRN_RPA,r1
+	mfspr	r2,SPRN_SRR1		/* Need to restore CR0 */
+	mtcrf	0x80,r2
+BEGIN_MMU_FTR_SECTION
+	li	r0,1
+	mfspr	r1,SPRN_SPRG_603_LRU
+	rlwinm	r2,r3,20,27,31		/* Get Address bits 15:19 */
+	slw	r0,r0,r2
+	xor	r1,r0,r1
+	srw	r0,r1,r2
+	mtspr   SPRN_SPRG_603_LRU,r1
+	mfspr	r2,SPRN_SRR1
+	rlwimi	r2,r0,31-14,14,14
+	mtspr   SPRN_SRR1,r2
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
+	tlbld	r3
+	rfi
+
+#ifndef CONFIG_ALTIVEC
+#define altivec_assist_exception	unknown_exception
+#endif
+
+	EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_EE)
+	EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_EE)
+	EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_EE)
+	EXCEPTION(0x1700, Trap_17, TAUException, EXC_XFER_STD)
+	EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_EE)
+	EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
+
+	. = 0x3000
+
+AltiVecUnavailable:
+	EXCEPTION_PROLOG
+#ifdef CONFIG_ALTIVEC
+	beq	1f
+	bl	load_up_altivec		/* if from user, just load it up */
+	b	fast_exception_return
+#endif /* CONFIG_ALTIVEC */
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE_LITE(0xf20, altivec_unavailable_exception)
+
+PerformanceMonitor:
+	EXCEPTION_PROLOG
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_STD(0xf00, performance_monitor_exception)
+
+
+/*
+ * This code is jumped to from the startup code to copy
+ * the kernel image to physical address PHYSICAL_START.
+ */
+relocate_kernel:
+	addis	r9,r26,klimit@ha	/* fetch klimit */
+	lwz	r25,klimit@l(r9)
+	addis	r25,r25,-KERNELBASE@h
+	lis	r3,PHYSICAL_START@h	/* Destination base address */
+	li	r6,0			/* Destination offset */
+	li	r5,0x4000		/* # bytes of memory to copy */
+	bl	copy_and_flush		/* copy the first 0x4000 bytes */
+	addi	r0,r3,4f@l		/* jump to the address of 4f */
+	mtctr	r0			/* in copy and do the rest. */
+	bctr				/* jump to the copy */
+4:	mr	r5,r25
+	bl	copy_and_flush		/* copy the rest */
+	b	turn_on_mmu
+
+/*
+ * Copy routine used to copy the kernel to start at physical address 0
+ * and flush and invalidate the caches as needed.
+ * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset
+ * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5.
+ */
+_ENTRY(copy_and_flush)
+	addi	r5,r5,-4
+	addi	r6,r6,-4
+4:	li	r0,L1_CACHE_BYTES/4
+	mtctr	r0
+3:	addi	r6,r6,4			/* copy a cache line */
+	lwzx	r0,r6,r4
+	stwx	r0,r6,r3
+	bdnz	3b
+	dcbst	r6,r3			/* write it to memory */
+	sync
+	icbi	r6,r3			/* flush the icache line */
+	cmplw	0,r6,r5
+	blt	4b
+	sync				/* additional sync needed on g4 */
+	isync
+	addi	r5,r5,4
+	addi	r6,r6,4
+	blr
+
+#ifdef CONFIG_SMP
+	.globl __secondary_start_mpc86xx
+__secondary_start_mpc86xx:
+	mfspr	r3, SPRN_PIR
+	stw	r3, __secondary_hold_acknowledge@l(0)
+	mr	r24, r3			/* cpu # */
+	b	__secondary_start
+
+	.globl	__secondary_start_pmac_0
+__secondary_start_pmac_0:
+	/* NB the entries for cpus 0, 1, 2 must each occupy 8 bytes. */
+	li	r24,0
+	b	1f
+	li	r24,1
+	b	1f
+	li	r24,2
+	b	1f
+	li	r24,3
+1:
+	/* on powersurge, we come in here with IR=0 and DR=1, and DBAT 0
+	   set to map the 0xf0000000 - 0xffffffff region */
+	mfmsr	r0
+	rlwinm	r0,r0,0,28,26		/* clear DR (0x10) */
+	SYNC
+	mtmsr	r0
+	isync
+
+	.globl	__secondary_start
+__secondary_start:
+	/* Copy some CPU settings from CPU 0 */
+	bl	__restore_cpu_setup
+
+	lis	r3,-KERNELBASE@h
+	mr	r4,r24
+	bl	call_setup_cpu		/* Call setup_cpu for this CPU */
+#ifdef CONFIG_6xx
+	lis	r3,-KERNELBASE@h
+	bl	init_idle_6xx
+#endif /* CONFIG_6xx */
+
+	/* get current_thread_info and current */
+	lis	r1,secondary_ti@ha
+	tophys(r1,r1)
+	lwz	r1,secondary_ti@l(r1)
+	tophys(r2,r1)
+	lwz	r2,TI_TASK(r2)
+
+	/* stack */
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	li	r0,0
+	tophys(r3,r1)
+	stw	r0,0(r3)
+
+	/* load up the MMU */
+	bl	load_up_mmu
+
+	/* ptr to phys current thread */
+	tophys(r4,r2)
+	addi	r4,r4,THREAD	/* phys address of our thread_struct */
+	mtspr	SPRN_SPRG_THREAD,r4
+	li	r3,0
+	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */
+
+	/* enable MMU and jump to start_secondary */
+	li	r4,MSR_KERNEL
+	lis	r3,start_secondary@h
+	ori	r3,r3,start_secondary@l
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	SYNC
+	RFI
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+#include "../kvm/book3s_rmhandlers.S"
+#endif
+
+/*
+ * Those generic dummy functions are kept for CPUs not
+ * included in CONFIG_6xx
+ */
+#if !defined(CONFIG_6xx)
+_ENTRY(__save_cpu_setup)
+	blr
+_ENTRY(__restore_cpu_setup)
+	blr
+#endif /* !defined(CONFIG_6xx) */
+
+
+/*
+ * Load stuff into the MMU.  Intended to be called with
+ * IR=0 and DR=0.
+ */
+load_up_mmu:
+	sync			/* Force all PTE updates to finish */
+	isync
+	tlbia			/* Clear all TLB entries */
+	sync			/* wait for tlbia/tlbie to finish */
+	TLBSYNC			/* ... on all CPUs */
+	/* Load the SDR1 register (hash table base & size) */
+	lis	r6,_SDR1@ha
+	tophys(r6,r6)
+	lwz	r6,_SDR1@l(r6)
+	mtspr	SPRN_SDR1,r6
+	li	r0,16		/* load up segment register values */
+	mtctr	r0		/* for context 0 */
+	lis	r3,0x2000	/* Ku = 1, VSID = 0 */
+	li	r4,0
+3:	mtsrin	r3,r4
+	addi	r3,r3,0x111	/* increment VSID */
+	addis	r4,r4,0x1000	/* address of next segment */
+	bdnz	3b
+
+/* Load the BAT registers with the values set up by MMU_init.
+   MMU_init takes care of whether we're on a 601 or not. */
+	mfpvr	r3
+	srwi	r3,r3,16
+	cmpwi	r3,1
+	lis	r3,BATS@ha
+	addi	r3,r3,BATS@l
+	tophys(r3,r3)
+	LOAD_BAT(0,r3,r4,r5)
+	LOAD_BAT(1,r3,r4,r5)
+	LOAD_BAT(2,r3,r4,r5)
+	LOAD_BAT(3,r3,r4,r5)
+BEGIN_MMU_FTR_SECTION
+	LOAD_BAT(4,r3,r4,r5)
+	LOAD_BAT(5,r3,r4,r5)
+	LOAD_BAT(6,r3,r4,r5)
+	LOAD_BAT(7,r3,r4,r5)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+	blr
+
+/*
+ * This is where the main kernel code starts.
+ */
+start_here:
+	/* ptr to current */
+	lis	r2,init_task@h
+	ori	r2,r2,init_task@l
+	/* Set up for using our exception vectors */
+	/* ptr to phys current thread */
+	tophys(r4,r2)
+	addi	r4,r4,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG_THREAD,r4
+	li	r3,0
+	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */
+
+	/* stack */
+	lis	r1,init_thread_union@ha
+	addi	r1,r1,init_thread_union@l
+	li	r0,0
+	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
+/*
+ * Do early platform-specific initialization,
+ * and set up the MMU.
+ */
+	li	r3,0
+	mr	r4,r31
+	bl	machine_init
+	bl	__save_cpu_setup
+	bl	MMU_init
+
+/*
+ * Go back to running unmapped so we can load up new values
+ * for SDR1 (hash table pointer) and the segment registers
+ * and change to using our exception vectors.
+ */
+	lis	r4,2f@h
+	ori	r4,r4,2f@l
+	tophys(r4,r4)
+	li	r3,MSR_KERNEL & ~(MSR_IR|MSR_DR)
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	SYNC
+	RFI
+/* Load up the kernel context */
+2:	bl	load_up_mmu
+
+#ifdef CONFIG_BDI_SWITCH
+	/* Add helper information for the Abatron bdiGDB debugger.
+	 * We do this here because we know the mmu is disabled, and
+	 * will be enabled for real in just a few instructions.
+	 */
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r5, 0xf0(r0)	/* This much match your Abatron config */
+	lis	r6, swapper_pg_dir@h
+	ori	r6, r6, swapper_pg_dir@l
+	tophys(r5, r5)
+	stw	r6, 0(r5)
+#endif /* CONFIG_BDI_SWITCH */
+
+/* Now turn on the MMU for real! */
+	li	r4,MSR_KERNEL
+	lis	r3,start_kernel@h
+	ori	r3,r3,start_kernel@l
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	SYNC
+	RFI
+
+/*
+ * void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
+ *
+ * Set up the segment registers for a new context.
+ */
+_ENTRY(switch_mmu_context)
+	lwz	r3,MMCONTEXTID(r4)
+	cmpwi	cr0,r3,0
+	blt-	4f
+	mulli	r3,r3,897	/* multiply context by skew factor */
+	rlwinm	r3,r3,4,8,27	/* VSID = (context & 0xfffff) << 4 */
+	addis	r3,r3,0x6000	/* Set Ks, Ku bits */
+	li	r0,NUM_USER_SEGMENTS
+	mtctr	r0
+
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is passed as second argument.
+	 */
+	lwz	r4,MM_PGD(r4)
+	lis	r5, KERNELBASE@h
+	lwz	r5, 0xf0(r5)
+	stw	r4, 0x4(r5)
+#endif
+	li	r4,0
+	isync
+3:
+	mtsrin	r3,r4
+	addi	r3,r3,0x111	/* next VSID */
+	rlwinm	r3,r3,0,8,3	/* clear out any overflow from VSID field */
+	addis	r4,r4,0x1000	/* address of next segment */
+	bdnz	3b
+	sync
+	isync
+	blr
+4:	trap
+	EMIT_BUG_ENTRY 4b,__FILE__,__LINE__,0
+	blr
+EXPORT_SYMBOL(switch_mmu_context)
+
+/*
+ * An undocumented "feature" of 604e requires that the v bit
+ * be cleared before changing BAT values.
+ *
+ * Also, newer IBM firmware does not clear bat3 and 4 so
+ * this makes sure it's done.
+ *  -- Cort
+ */
+clear_bats:
+	li	r10,0
+	mfspr	r9,SPRN_PVR
+	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
+	cmpwi	r9, 1
+	beq	1f
+
+	mtspr	SPRN_DBAT0U,r10
+	mtspr	SPRN_DBAT0L,r10
+	mtspr	SPRN_DBAT1U,r10
+	mtspr	SPRN_DBAT1L,r10
+	mtspr	SPRN_DBAT2U,r10
+	mtspr	SPRN_DBAT2L,r10
+	mtspr	SPRN_DBAT3U,r10
+	mtspr	SPRN_DBAT3L,r10
+1:
+	mtspr	SPRN_IBAT0U,r10
+	mtspr	SPRN_IBAT0L,r10
+	mtspr	SPRN_IBAT1U,r10
+	mtspr	SPRN_IBAT1L,r10
+	mtspr	SPRN_IBAT2U,r10
+	mtspr	SPRN_IBAT2L,r10
+	mtspr	SPRN_IBAT3U,r10
+	mtspr	SPRN_IBAT3L,r10
+BEGIN_MMU_FTR_SECTION
+	/* Here's a tweak: at this point, CPU setup have
+	 * not been called yet, so HIGH_BAT_EN may not be
+	 * set in HID0 for the 745x processors. However, it
+	 * seems that doesn't affect our ability to actually
+	 * write to these SPRs.
+	 */
+	mtspr	SPRN_DBAT4U,r10
+	mtspr	SPRN_DBAT4L,r10
+	mtspr	SPRN_DBAT5U,r10
+	mtspr	SPRN_DBAT5L,r10
+	mtspr	SPRN_DBAT6U,r10
+	mtspr	SPRN_DBAT6L,r10
+	mtspr	SPRN_DBAT7U,r10
+	mtspr	SPRN_DBAT7L,r10
+	mtspr	SPRN_IBAT4U,r10
+	mtspr	SPRN_IBAT4L,r10
+	mtspr	SPRN_IBAT5U,r10
+	mtspr	SPRN_IBAT5L,r10
+	mtspr	SPRN_IBAT6U,r10
+	mtspr	SPRN_IBAT6L,r10
+	mtspr	SPRN_IBAT7U,r10
+	mtspr	SPRN_IBAT7L,r10
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+	blr
+
+flush_tlbs:
+	lis	r10, 0x40
+1:	addic.	r10, r10, -0x1000
+	tlbie	r10
+	bgt	1b
+	sync
+	blr
+
+mmu_off:
+ 	addi	r4, r3, __after_mmu_off - _start
+	mfmsr	r3
+	andi.	r0,r3,MSR_DR|MSR_IR		/* MMU enabled? */
+	beqlr
+	andc	r3,r3,r0
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	sync
+	RFI
+
+/*
+ * On 601, we use 3 BATs to map up to 24M of RAM at _PAGE_OFFSET
+ * (we keep one for debugging) and on others, we use one 256M BAT.
+ */
+initial_bats:
+	lis	r11,PAGE_OFFSET@h
+	mfspr	r9,SPRN_PVR
+	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
+	cmpwi	0,r9,1
+	bne	4f
+	ori	r11,r11,4		/* set up BAT registers for 601 */
+	li	r8,0x7f			/* valid, block length = 8MB */
+	mtspr	SPRN_IBAT0U,r11		/* N.B. 601 has valid bit in */
+	mtspr	SPRN_IBAT0L,r8		/* lower BAT register */
+	addis	r11,r11,0x800000@h
+	addis	r8,r8,0x800000@h
+	mtspr	SPRN_IBAT1U,r11
+	mtspr	SPRN_IBAT1L,r8
+	addis	r11,r11,0x800000@h
+	addis	r8,r8,0x800000@h
+	mtspr	SPRN_IBAT2U,r11
+	mtspr	SPRN_IBAT2L,r8
+	isync
+	blr
+
+4:	tophys(r8,r11)
+#ifdef CONFIG_SMP
+	ori	r8,r8,0x12		/* R/W access, M=1 */
+#else
+	ori	r8,r8,2			/* R/W access */
+#endif /* CONFIG_SMP */
+	ori	r11,r11,BL_256M<<2|0x2	/* set up BAT registers for 604 */
+
+	mtspr	SPRN_DBAT0L,r8		/* N.B. 6xx (not 601) have valid */
+	mtspr	SPRN_DBAT0U,r11		/* bit in upper BAT register */
+	mtspr	SPRN_IBAT0L,r8
+	mtspr	SPRN_IBAT0U,r11
+	isync
+	blr
+
+
+#ifdef CONFIG_BOOTX_TEXT
+setup_disp_bat:
+	/*
+	 * setup the display bat prepared for us in prom.c
+	 */
+	mflr	r8
+	bl	reloc_offset
+	mtlr	r8
+	addis	r8,r3,disp_BAT@ha
+	addi	r8,r8,disp_BAT@l
+	cmpwi	cr0,r8,0
+	beqlr
+	lwz	r11,0(r8)
+	lwz	r8,4(r8)
+	mfspr	r9,SPRN_PVR
+	rlwinm	r9,r9,16,16,31		/* r9 = 1 for 601, 4 for 604 */
+	cmpwi	0,r9,1
+	beq	1f
+	mtspr	SPRN_DBAT3L,r8
+	mtspr	SPRN_DBAT3U,r11
+	blr
+1:	mtspr	SPRN_IBAT3L,r8
+	mtspr	SPRN_IBAT3U,r11
+	blr
+#endif /* CONFIG_BOOTX_TEXT */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_CPM
+setup_cpm_bat:
+	lis	r8, 0xf000
+	ori	r8, r8,	0x002a
+	mtspr	SPRN_DBAT1L, r8
+
+	lis	r11, 0xf000
+	ori	r11, r11, (BL_1M << 2) | 2
+	mtspr	SPRN_DBAT1U, r11
+
+	blr
+#endif
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO
+setup_usbgecko_bat:
+	/* prepare a BAT for early io */
+#if defined(CONFIG_GAMECUBE)
+	lis	r8, 0x0c00
+#elif defined(CONFIG_WII)
+	lis	r8, 0x0d00
+#else
+#error Invalid platform for USB Gecko based early debugging.
+#endif
+	/*
+	 * The virtual address used must match the virtual address
+	 * associated to the fixmap entry FIX_EARLY_DEBUG_BASE.
+	 */
+	lis	r11, 0xfffe	/* top 128K */
+	ori	r8, r8, 0x002a	/* uncached, guarded ,rw */
+	ori	r11, r11, 0x2	/* 128K, Vs=1, Vp=0 */
+	mtspr	SPRN_DBAT1L, r8
+	mtspr	SPRN_DBAT1U, r11
+	blr
+#endif
+
+#ifdef CONFIG_8260
+/* Jump into the system reset for the rom.
+ * We first disable the MMU, and then jump to the ROM reset address.
+ *
+ * r3 is the board info structure, r4 is the location for starting.
+ * I use this for building a small kernel that can load other kernels,
+ * rather than trying to write or rely on a rom monitor that can tftp load.
+ */
+       .globl  m8260_gorom
+m8260_gorom:
+	mfmsr	r0
+	rlwinm	r0,r0,0,17,15	/* clear MSR_EE in r0 */
+	sync
+	mtmsr	r0
+	sync
+	mfspr	r11, SPRN_HID0
+	lis	r10, 0
+	ori	r10,r10,HID0_ICE|HID0_DCE
+	andc	r11, r11, r10
+	mtspr	SPRN_HID0, r11
+	isync
+	li	r5, MSR_ME|MSR_RI
+	lis	r6,2f@h
+	addis	r6,r6,-KERNELBASE@h
+	ori	r6,r6,2f@l
+	mtspr	SPRN_SRR0,r6
+	mtspr	SPRN_SRR1,r5
+	isync
+	sync
+	rfi
+2:
+	mtlr	r4
+	blr
+#endif
+
+
+/*
+ * We put a few things here that have to be page-aligned.
+ * This stuff goes at the beginning of the data segment,
+ * which is page-aligned.
+ */
+	.data
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	4096
+EXPORT_SYMBOL(empty_zero_page)
+
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+/* Room for two PTE pointers, usually the kernel and current user pointers
+ * to their respective root page table.
+ */
+abatron_pteptrs:
+	.space	8
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
new file mode 100644
index 000000000..b19d78410
--- /dev/null
+++ b/arch/powerpc/kernel/head_40x.S
@@ -0,0 +1,985 @@
+/*
+ *    Copyright (c) 1995-1996 Gary Thomas <gdt@linuxppc.org>
+ *      Initial PowerPC version.
+ *    Copyright (c) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *      Rewritten for PReP
+ *    Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ *      Low-level exception handers, MMU support, and rewrite.
+ *    Copyright (c) 1997 Dan Malek <dmalek@jlc.net>
+ *      PowerPC 8xx modifications.
+ *    Copyright (c) 1998-1999 TiVo, Inc.
+ *      PowerPC 403GCX modifications.
+ *    Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu>
+ *      PowerPC 403GCX/405GP modifications.
+ *    Copyright 2000 MontaVista Software Inc.
+ *	PPC405 modifications
+ *      PowerPC 403GCX/405GP modifications.
+ * 	Author: MontaVista Software, Inc.
+ *         	frank_rowand@mvista.com or source@mvista.com
+ * 	   	debbie_chu@mvista.com
+ *
+ *
+ *    Module name: head_4xx.S
+ *
+ *    Description:
+ *      Kernel execution entry point code.
+ *
+ *    This program is free software; you can redistribute it and/or
+ *    modify it under the terms of the GNU General Public License
+ *    as published by the Free Software Foundation; either version
+ *    2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/asm-405.h>
+
+/* As with the other PowerPC ports, it is expected that when code
+ * execution begins here, the following registers contain valid, yet
+ * optional, information:
+ *
+ *   r3 - Board info structure pointer (DRAM, frequency, MAC address, etc.)
+ *   r4 - Starting address of the init RAM disk
+ *   r5 - Ending address of the init RAM disk
+ *   r6 - Start of kernel command line string (e.g. "mem=96m")
+ *   r7 - End of kernel command line string
+ *
+ * This is all going to change RSN when we add bi_recs.......  -- Dan
+ */
+	__HEAD
+_ENTRY(_stext);
+_ENTRY(_start);
+
+	mr	r31,r3			/* save device tree ptr */
+
+	/* We have to turn on the MMU right away so we get cache modes
+	 * set correctly.
+	 */
+	bl	initial_mmu
+
+/* We now have the lower 16 Meg mapped into TLB entries, and the caches
+ * ready to work.
+ */
+turn_on_mmu:
+	lis	r0,MSR_KERNEL@h
+	ori	r0,r0,MSR_KERNEL@l
+	mtspr	SPRN_SRR1,r0
+	lis	r0,start_here@h
+	ori	r0,r0,start_here@l
+	mtspr	SPRN_SRR0,r0
+	SYNC
+	rfi				/* enables MMU */
+	b	.			/* prevent prefetch past rfi */
+
+/*
+ * This area is used for temporarily saving registers during the
+ * critical exception prolog.
+ */
+	. = 0xc0
+crit_save:
+_ENTRY(crit_r10)
+	.space	4
+_ENTRY(crit_r11)
+	.space	4
+_ENTRY(crit_srr0)
+	.space	4
+_ENTRY(crit_srr1)
+	.space	4
+_ENTRY(saved_ksp_limit)
+	.space	4
+
+/*
+ * Exception vector entry code. This code runs with address translation
+ * turned off (i.e. using physical addresses). We assume SPRG_THREAD has
+ * the physical address of the current task thread_struct.
+ * Note that we have to have decremented r1 before we write to any fields
+ * of the exception frame, since a critical interrupt could occur at any
+ * time, and it will write to the area immediately below the current r1.
+ */
+#define NORMAL_EXCEPTION_PROLOG						     \
+	mtspr	SPRN_SPRG_SCRATCH0,r10;	/* save two registers to work with */\
+	mtspr	SPRN_SPRG_SCRATCH1,r11;					     \
+	mtspr	SPRN_SPRG_SCRATCH2,r1;					     \
+	mfcr	r10;			/* save CR in r10 for now	   */\
+	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */\
+	andi.	r11,r11,MSR_PR;						     \
+	beq	1f;							     \
+	mfspr	r1,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
+	lwz	r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack   */\
+	addi	r1,r1,THREAD_SIZE;					     \
+1:	subi	r1,r1,INT_FRAME_SIZE;	/* Allocate an exception frame     */\
+	tophys(r11,r1);							     \
+	stw	r10,_CCR(r11);          /* save various registers	   */\
+	stw	r12,GPR12(r11);						     \
+	stw	r9,GPR9(r11);						     \
+	mfspr	r10,SPRN_SPRG_SCRATCH0;					     \
+	stw	r10,GPR10(r11);						     \
+	mfspr	r12,SPRN_SPRG_SCRATCH1;					     \
+	stw	r12,GPR11(r11);						     \
+	mflr	r10;							     \
+	stw	r10,_LINK(r11);						     \
+	mfspr	r10,SPRN_SPRG_SCRATCH2;					     \
+	mfspr	r12,SPRN_SRR0;						     \
+	stw	r10,GPR1(r11);						     \
+	mfspr	r9,SPRN_SRR1;						     \
+	stw	r10,0(r11);						     \
+	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)	   */\
+	stw	r0,GPR0(r11);						     \
+	SAVE_4GPRS(3, r11);						     \
+	SAVE_2GPRS(7, r11)
+
+/*
+ * Exception prolog for critical exceptions.  This is a little different
+ * from the normal exception prolog above since a critical exception
+ * can potentially occur at any point during normal exception processing.
+ * Thus we cannot use the same SPRG registers as the normal prolog above.
+ * Instead we use a couple of words of memory at low physical addresses.
+ * This is OK since we don't support SMP on these processors.
+ */
+#define CRITICAL_EXCEPTION_PROLOG					     \
+	stw	r10,crit_r10@l(0);	/* save two registers to work with */\
+	stw	r11,crit_r11@l(0);					     \
+	mfcr	r10;			/* save CR in r10 for now	   */\
+	mfspr	r11,SPRN_SRR3;		/* check whether user or kernel    */\
+	andi.	r11,r11,MSR_PR;						     \
+	lis	r11,critirq_ctx@ha;					     \
+	tophys(r11,r11);						     \
+	lwz	r11,critirq_ctx@l(r11);					     \
+	beq	1f;							     \
+	/* COMING FROM USER MODE */					     \
+	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
+	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+1:	addi	r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm  */\
+	tophys(r11,r11);						     \
+	stw	r10,_CCR(r11);          /* save various registers	   */\
+	stw	r12,GPR12(r11);						     \
+	stw	r9,GPR9(r11);						     \
+	mflr	r10;							     \
+	stw	r10,_LINK(r11);						     \
+	mfspr	r12,SPRN_DEAR;		/* save DEAR and ESR in the frame  */\
+	stw	r12,_DEAR(r11);		/* since they may have had stuff   */\
+	mfspr	r9,SPRN_ESR;		/* in them at the point where the  */\
+	stw	r9,_ESR(r11);		/* exception was taken		   */\
+	mfspr	r12,SPRN_SRR2;						     \
+	stw	r1,GPR1(r11);						     \
+	mfspr	r9,SPRN_SRR3;						     \
+	stw	r1,0(r11);						     \
+	tovirt(r1,r11);							     \
+	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)	   */\
+	stw	r0,GPR0(r11);						     \
+	SAVE_4GPRS(3, r11);						     \
+	SAVE_2GPRS(7, r11)
+
+	/*
+	 * State at this point:
+	 * r9 saved in stack frame, now saved SRR3 & ~MSR_WE
+	 * r10 saved in crit_r10 and in stack frame, trashed
+	 * r11 saved in crit_r11 and in stack frame,
+	 *	now phys stack/exception frame pointer
+	 * r12 saved in stack frame, now saved SRR2
+	 * CR saved in stack frame, CR0.EQ = !SRR3.PR
+	 * LR, DEAR, ESR in stack frame
+	 * r1 saved in stack frame, now virt stack/excframe pointer
+	 * r0, r3-r8 saved in stack frame
+	 */
+
+/*
+ * Exception vectors.
+ */
+#define	START_EXCEPTION(n, label)					     \
+	. = n;								     \
+label:
+
+#define EXCEPTION(n, label, hdlr, xfer)				\
+	START_EXCEPTION(n, label);				\
+	NORMAL_EXCEPTION_PROLOG;				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	xfer(n, hdlr)
+
+#define CRITICAL_EXCEPTION(n, label, hdlr)			\
+	START_EXCEPTION(n, label);				\
+	CRITICAL_EXCEPTION_PROLOG;				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
+			  NOCOPY, crit_transfer_to_handler,	\
+			  ret_from_crit_exc)
+
+#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)	\
+	li	r10,trap;					\
+	stw	r10,_TRAP(r11);					\
+	lis	r10,msr@h;					\
+	ori	r10,r10,msr@l;					\
+	copyee(r10, r9);					\
+	bl	tfer;		 				\
+	.long	hdlr;						\
+	.long	ret
+
+#define COPY_EE(d, s)		rlwimi d,s,0,16,16
+#define NOCOPY(d, s)
+
+#define EXC_XFER_STD(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_LITE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
+			  ret_from_except)
+
+#define EXC_XFER_EE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_EE_LITE(n, hdlr)	\
+	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \
+			  ret_from_except)
+
+
+/*
+ * 0x0100 - Critical Interrupt Exception
+ */
+	CRITICAL_EXCEPTION(0x0100, CriticalInterrupt, unknown_exception)
+
+/*
+ * 0x0200 - Machine Check Exception
+ */
+	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+
+/*
+ * 0x0300 - Data Storage Exception
+ * This happens for just a few reasons.  U0 set (but we don't do that),
+ * or zone protection fault (user violation, write to protected page).
+ * If this is just an update of modified status, we do that quickly
+ * and exit.  Otherwise, we call heavywight functions to do the work.
+ */
+	START_EXCEPTION(0x0300,	DataStorage)
+	mtspr	SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+#ifdef CONFIG_403GCX
+	stw     r12, 0(r0)
+	stw     r9, 4(r0)
+	mfcr    r11
+	mfspr   r12, SPRN_PID
+	stw     r11, 8(r0)
+	stw     r12, 12(r0)
+#else
+	mtspr	SPRN_SPRG_SCRATCH3, r12
+	mtspr	SPRN_SPRG_SCRATCH4, r9
+	mfcr	r11
+	mfspr	r12, SPRN_PID
+	mtspr	SPRN_SPRG_SCRATCH6, r11
+	mtspr	SPRN_SPRG_SCRATCH5, r12
+#endif
+
+	/* First, check if it was a zone fault (which means a user
+	* tried to access a kernel or read-protected page - always
+	* a SEGV).  All other faults here must be stores, so no
+	* need to check ESR_DST as well. */
+	mfspr	r10, SPRN_ESR
+	andis.	r10, r10, ESR_DIZ@h
+	bne	2f
+
+	mfspr	r10, SPRN_DEAR		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	r10, r11
+	blt+	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+	li	r9, 0
+	mtspr	SPRN_PID, r9		/* TLB will have 0 TID */
+	b	4f
+
+	/* Get the PGD for the current thread.
+	 */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+4:
+	tophys(r11, r11)
+	rlwimi	r11, r10, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
+	lwz	r11, 0(r11)		/* Get L1 entry */
+	rlwinm.	r12, r11, 0, 0, 19	/* Extract L2 (pte) base address */
+	beq	2f			/* Bail if no table */
+
+	rlwimi	r12, r10, 22, 20, 29	/* Compute PTE address */
+	lwz	r11, 0(r12)		/* Get Linux PTE */
+
+	andi.	r9, r11, _PAGE_RW	/* Is it writeable? */
+	beq	2f			/* Bail if not */
+
+	/* Update 'changed'.
+	*/
+	ori	r11, r11, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
+	stw	r11, 0(r12)		/* Update Linux page table */
+
+	/* Most of the Linux PTE is ready to load into the TLB LO.
+	 * We set ZSEL, where only the LS-bit determines user access.
+	 * We set execute, because we don't have the granularity to
+	 * properly set this at the page level (Linux problem).
+	 * If shared is set, we cause a zero PID->TID load.
+	 * Many of these bits are software only.  Bits we don't set
+	 * here we (properly should) assume have the appropriate value.
+	 */
+	li	r12, 0x0ce2
+	andc	r11, r11, r12		/* Make sure 20, 21 are zero */
+
+	/* find the TLB index that caused the fault.  It has to be here.
+	*/
+	tlbsx	r9, 0, r10
+
+	tlbwe	r11, r9, TLB_DATA		/* Load TLB LO */
+
+	/* Done...restore registers and get out of here.
+	*/
+#ifdef CONFIG_403GCX
+	lwz     r12, 12(r0)
+	lwz     r11, 8(r0)
+	mtspr   SPRN_PID, r12
+	mtcr    r11
+	lwz     r9, 4(r0)
+	lwz     r12, 0(r0)
+#else
+	mfspr	r12, SPRN_SPRG_SCRATCH5
+	mfspr	r11, SPRN_SPRG_SCRATCH6
+	mtspr	SPRN_PID, r12
+	mtcr	r11
+	mfspr	r9, SPRN_SPRG_SCRATCH4
+	mfspr	r12, SPRN_SPRG_SCRATCH3
+#endif
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	PPC405_ERR77_SYNC
+	rfi			/* Should sync shadow TLBs */
+	b	.		/* prevent prefetch past rfi */
+
+2:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+#ifdef CONFIG_403GCX
+	lwz     r12, 12(r0)
+	lwz     r11, 8(r0)
+	mtspr   SPRN_PID, r12
+	mtcr    r11
+	lwz     r9, 4(r0)
+	lwz     r12, 0(r0)
+#else
+	mfspr	r12, SPRN_SPRG_SCRATCH5
+	mfspr	r11, SPRN_SPRG_SCRATCH6
+	mtspr	SPRN_PID, r12
+	mtcr	r11
+	mfspr	r9, SPRN_SPRG_SCRATCH4
+	mfspr	r12, SPRN_SPRG_SCRATCH3
+#endif
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	b	DataAccess
+
+/*
+ * 0x0400 - Instruction Storage Exception
+ * This is caused by a fetch from non-execute or guarded pages.
+ */
+	START_EXCEPTION(0x0400, InstructionAccess)
+	NORMAL_EXCEPTION_PROLOG
+	mr	r4,r12			/* Pass SRR0 as arg2 */
+	li	r5,0			/* Pass zero as arg3 */
+	EXC_XFER_LITE(0x400, handle_page_fault)
+
+/* 0x0500 - External Interrupt Exception */
+	EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
+
+/* 0x0600 - Alignment Exception */
+	START_EXCEPTION(0x0600, Alignment)
+	NORMAL_EXCEPTION_PROLOG
+	mfspr	r4,SPRN_DEAR		/* Grab the DEAR and save it */
+	stw	r4,_DEAR(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE(0x600, alignment_exception)
+
+/* 0x0700 - Program Exception */
+	START_EXCEPTION(0x0700, ProgramCheck)
+	NORMAL_EXCEPTION_PROLOG
+	mfspr	r4,SPRN_ESR		/* Grab the ESR and save it */
+	stw	r4,_ESR(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_STD(0x700, program_check_exception)
+
+	EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_EE)
+
+/* 0x0C00 - System Call Exception */
+	START_EXCEPTION(0x0C00,	SystemCall)
+	NORMAL_EXCEPTION_PROLOG
+	EXC_XFER_EE_LITE(0xc00, DoSyscall)
+
+	EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE)
+
+/* 0x1000 - Programmable Interval Timer (PIT) Exception */
+	. = 0x1000
+	b Decrementer
+
+/* 0x1010 - Fixed Interval Timer (FIT) Exception
+*/
+	. = 0x1010
+	b FITException
+
+/* 0x1020 - Watchdog Timer (WDT) Exception
+*/
+	. = 0x1020
+	b WDTException
+
+/* 0x1100 - Data TLB Miss Exception
+ * As the name implies, translation is not in the MMU, so search the
+ * page tables and fix it.  The only purpose of this function is to
+ * load TLB entries from the page table if they exist.
+ */
+	START_EXCEPTION(0x1100,	DTLBMiss)
+	mtspr	SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+#ifdef CONFIG_403GCX
+	stw     r12, 0(r0)
+	stw     r9, 4(r0)
+	mfcr    r11
+	mfspr   r12, SPRN_PID
+	stw     r11, 8(r0)
+	stw     r12, 12(r0)
+#else
+	mtspr	SPRN_SPRG_SCRATCH3, r12
+	mtspr	SPRN_SPRG_SCRATCH4, r9
+	mfcr	r11
+	mfspr	r12, SPRN_PID
+	mtspr	SPRN_SPRG_SCRATCH6, r11
+	mtspr	SPRN_SPRG_SCRATCH5, r12
+#endif
+	mfspr	r10, SPRN_DEAR		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	r10, r11
+	blt+	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+	li	r9, 0
+	mtspr	SPRN_PID, r9		/* TLB will have 0 TID */
+	b	4f
+
+	/* Get the PGD for the current thread.
+	 */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+4:
+	tophys(r11, r11)
+	rlwimi	r11, r10, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
+	lwz	r12, 0(r11)		/* Get L1 entry */
+	andi.	r9, r12, _PMD_PRESENT	/* Check if it points to a PTE page */
+	beq	2f			/* Bail if no table */
+
+	rlwimi	r12, r10, 22, 20, 29	/* Compute PTE address */
+	lwz	r11, 0(r12)		/* Get Linux PTE */
+	andi.	r9, r11, _PAGE_PRESENT
+	beq	5f
+
+	ori	r11, r11, _PAGE_ACCESSED
+	stw	r11, 0(r12)
+
+	/* Create TLB tag.  This is the faulting address plus a static
+	 * set of bits.  These are size, valid, E, U0.
+	*/
+	li	r12, 0x00c0
+	rlwimi	r10, r12, 0, 20, 31
+
+	b	finish_tlb_load
+
+2:	/* Check for possible large-page pmd entry */
+	rlwinm.	r9, r12, 2, 22, 24
+	beq	5f
+
+	/* Create TLB tag.  This is the faulting address, plus a static
+	 * set of bits (valid, E, U0) plus the size from the PMD.
+	 */
+	ori	r9, r9, 0x40
+	rlwimi	r10, r9, 0, 20, 31
+	mr	r11, r12
+
+	b	finish_tlb_load
+
+5:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+#ifdef CONFIG_403GCX
+	lwz     r12, 12(r0)
+	lwz     r11, 8(r0)
+	mtspr   SPRN_PID, r12
+	mtcr    r11
+	lwz     r9, 4(r0)
+	lwz     r12, 0(r0)
+#else
+	mfspr	r12, SPRN_SPRG_SCRATCH5
+	mfspr	r11, SPRN_SPRG_SCRATCH6
+	mtspr	SPRN_PID, r12
+	mtcr	r11
+	mfspr	r9, SPRN_SPRG_SCRATCH4
+	mfspr	r12, SPRN_SPRG_SCRATCH3
+#endif
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	b	DataAccess
+
+/* 0x1200 - Instruction TLB Miss Exception
+ * Nearly the same as above, except we get our information from different
+ * registers and bailout to a different point.
+ */
+	START_EXCEPTION(0x1200,	ITLBMiss)
+	mtspr	SPRN_SPRG_SCRATCH0, r10	 /* Save some working registers */
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+#ifdef CONFIG_403GCX
+	stw     r12, 0(r0)
+	stw     r9, 4(r0)
+	mfcr    r11
+	mfspr   r12, SPRN_PID
+	stw     r11, 8(r0)
+	stw     r12, 12(r0)
+#else
+	mtspr	SPRN_SPRG_SCRATCH3, r12
+	mtspr	SPRN_SPRG_SCRATCH4, r9
+	mfcr	r11
+	mfspr	r12, SPRN_PID
+	mtspr	SPRN_SPRG_SCRATCH6, r11
+	mtspr	SPRN_SPRG_SCRATCH5, r12
+#endif
+	mfspr	r10, SPRN_SRR0		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	r10, r11
+	blt+	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+	li	r9, 0
+	mtspr	SPRN_PID, r9		/* TLB will have 0 TID */
+	b	4f
+
+	/* Get the PGD for the current thread.
+	 */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+4:
+	tophys(r11, r11)
+	rlwimi	r11, r10, 12, 20, 29	/* Create L1 (pgdir/pmd) address */
+	lwz	r12, 0(r11)		/* Get L1 entry */
+	andi.	r9, r12, _PMD_PRESENT	/* Check if it points to a PTE page */
+	beq	2f			/* Bail if no table */
+
+	rlwimi	r12, r10, 22, 20, 29	/* Compute PTE address */
+	lwz	r11, 0(r12)		/* Get Linux PTE */
+	andi.	r9, r11, _PAGE_PRESENT
+	beq	5f
+
+	ori	r11, r11, _PAGE_ACCESSED
+	stw	r11, 0(r12)
+
+	/* Create TLB tag.  This is the faulting address plus a static
+	 * set of bits.  These are size, valid, E, U0.
+	*/
+	li	r12, 0x00c0
+	rlwimi	r10, r12, 0, 20, 31
+
+	b	finish_tlb_load
+
+2:	/* Check for possible large-page pmd entry */
+	rlwinm.	r9, r12, 2, 22, 24
+	beq	5f
+
+	/* Create TLB tag.  This is the faulting address, plus a static
+	 * set of bits (valid, E, U0) plus the size from the PMD.
+	 */
+	ori	r9, r9, 0x40
+	rlwimi	r10, r9, 0, 20, 31
+	mr	r11, r12
+
+	b	finish_tlb_load
+
+5:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+#ifdef CONFIG_403GCX
+	lwz     r12, 12(r0)
+	lwz     r11, 8(r0)
+	mtspr   SPRN_PID, r12
+	mtcr    r11
+	lwz     r9, 4(r0)
+	lwz     r12, 0(r0)
+#else
+	mfspr	r12, SPRN_SPRG_SCRATCH5
+	mfspr	r11, SPRN_SPRG_SCRATCH6
+	mtspr	SPRN_PID, r12
+	mtcr	r11
+	mfspr	r9, SPRN_SPRG_SCRATCH4
+	mfspr	r12, SPRN_SPRG_SCRATCH3
+#endif
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	b	InstructionAccess
+
+	EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE)
+#ifdef CONFIG_IBM405_ERR51
+	/* 405GP errata 51 */
+	START_EXCEPTION(0x1700, Trap_17)
+	b DTLBMiss
+#else
+	EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE)
+#endif
+	EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_EE)
+
+/* Check for a single step debug exception while in an exception
+ * handler before state has been saved.  This is to catch the case
+ * where an instruction that we are trying to single step causes
+ * an exception (eg ITLB/DTLB miss) and thus the first instruction of
+ * the exception handler generates a single step debug exception.
+ *
+ * If we get a debug trap on the first instruction of an exception handler,
+ * we reset the MSR_DE in the _exception handler's_ MSR (the debug trap is
+ * a critical exception, so we are using SPRN_CSRR1 to manipulate the MSR).
+ * The exception handler was handling a non-critical interrupt, so it will
+ * save (and later restore) the MSR via SPRN_SRR1, which will still have
+ * the MSR_DE bit set.
+ */
+	/* 0x2000 - Debug Exception */
+	START_EXCEPTION(0x2000, DebugTrap)
+	CRITICAL_EXCEPTION_PROLOG
+
+	/*
+	 * If this is a single step or branch-taken exception in an
+	 * exception entry sequence, it was probably meant to apply to
+	 * the code where the exception occurred (since exception entry
+	 * doesn't turn off DE automatically).  We simulate the effect
+	 * of turning off DE on entry to an exception handler by turning
+	 * off DE in the SRR3 value and clearing the debug status.
+	 */
+	mfspr	r10,SPRN_DBSR		/* check single-step/branch taken */
+	andis.	r10,r10,DBSR_IC@h
+	beq+	2f
+
+	andi.	r10,r9,MSR_IR|MSR_PR	/* check supervisor + MMU off */
+	beq	1f			/* branch and fix it up */
+
+	mfspr   r10,SPRN_SRR2		/* Faulting instruction address */
+	cmplwi  r10,0x2100
+	bgt+    2f			/* address above exception vectors */
+
+	/* here it looks like we got an inappropriate debug exception. */
+1:	rlwinm	r9,r9,0,~MSR_DE		/* clear DE in the SRR3 value */
+	lis	r10,DBSR_IC@h		/* clear the IC event */
+	mtspr	SPRN_DBSR,r10
+	/* restore state and get out */
+	lwz	r10,_CCR(r11)
+	lwz	r0,GPR0(r11)
+	lwz	r1,GPR1(r11)
+	mtcrf	0x80,r10
+	mtspr	SPRN_SRR2,r12
+	mtspr	SPRN_SRR3,r9
+	lwz	r9,GPR9(r11)
+	lwz	r12,GPR12(r11)
+	lwz	r10,crit_r10@l(0)
+	lwz	r11,crit_r11@l(0)
+	PPC405_ERR77_SYNC
+	rfci
+	b	.
+
+	/* continue normal handling for a critical exception... */
+2:	mfspr	r4,SPRN_DBSR
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_TEMPLATE(DebugException, 0x2002, \
+		(MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
+		NOCOPY, crit_transfer_to_handler, ret_from_crit_exc)
+
+	/* Programmable Interval Timer (PIT) Exception. (from 0x1000) */
+Decrementer:
+	NORMAL_EXCEPTION_PROLOG
+	lis	r0,TSR_PIS@h
+	mtspr	SPRN_TSR,r0		/* Clear the PIT exception */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_LITE(0x1000, timer_interrupt)
+
+	/* Fixed Interval Timer (FIT) Exception. (from 0x1010) */
+FITException:
+	NORMAL_EXCEPTION_PROLOG
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	EXC_XFER_EE(0x1010, unknown_exception)
+
+	/* Watchdog Timer (WDT) Exception. (from 0x1020) */
+WDTException:
+	CRITICAL_EXCEPTION_PROLOG;
+	addi	r3,r1,STACK_FRAME_OVERHEAD;
+	EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2,
+	                  (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)),
+			  NOCOPY, crit_transfer_to_handler,
+			  ret_from_crit_exc)
+
+/*
+ * The other Data TLB exceptions bail out to this point
+ * if they can't resolve the lightweight TLB fault.
+ */
+DataAccess:
+	NORMAL_EXCEPTION_PROLOG
+	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
+	stw	r5,_ESR(r11)
+	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
+	EXC_XFER_LITE(0x300, handle_page_fault)
+
+/* Other PowerPC processors, namely those derived from the 6xx-series
+ * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved.
+ * However, for the 4xx-series processors these are neither defined nor
+ * reserved.
+ */
+
+	/* Damn, I came up one instruction too many to fit into the
+	 * exception space :-).  Both the instruction and data TLB
+	 * miss get to this point to load the TLB.
+	 * 	r10 - TLB_TAG value
+	 * 	r11 - Linux PTE
+	 *	r12, r9 - available to use
+	 *	PID - loaded with proper value when we get here
+	 *	Upon exit, we reload everything and RFI.
+	 * Actually, it will fit now, but oh well.....a common place
+	 * to load the TLB.
+	 */
+tlb_4xx_index:
+	.long	0
+finish_tlb_load:
+	/* load the next available TLB index.
+	*/
+	lwz	r9, tlb_4xx_index@l(0)
+	addi	r9, r9, 1
+	andi.	r9, r9, (PPC40X_TLB_SIZE-1)
+	stw	r9, tlb_4xx_index@l(0)
+
+6:
+	/*
+	 * Clear out the software-only bits in the PTE to generate the
+	 * TLB_DATA value.  These are the bottom 2 bits of the RPM, the
+	 * top 3 bits of the zone field, and M.
+	 */
+	li	r12, 0x0ce2
+	andc	r11, r11, r12
+
+	tlbwe	r11, r9, TLB_DATA		/* Load TLB LO */
+	tlbwe	r10, r9, TLB_TAG		/* Load TLB HI */
+
+	/* Done...restore registers and get out of here.
+	*/
+#ifdef CONFIG_403GCX
+	lwz     r12, 12(r0)
+	lwz     r11, 8(r0)
+	mtspr   SPRN_PID, r12
+	mtcr    r11
+	lwz     r9, 4(r0)
+	lwz     r12, 0(r0)
+#else
+	mfspr	r12, SPRN_SPRG_SCRATCH5
+	mfspr	r11, SPRN_SPRG_SCRATCH6
+	mtspr	SPRN_PID, r12
+	mtcr	r11
+	mfspr	r9, SPRN_SPRG_SCRATCH4
+	mfspr	r12, SPRN_SPRG_SCRATCH3
+#endif
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	PPC405_ERR77_SYNC
+	rfi			/* Should sync shadow TLBs */
+	b	.		/* prevent prefetch past rfi */
+
+/* This is where the main kernel code starts.
+ */
+start_here:
+
+	/* ptr to current */
+	lis	r2,init_task@h
+	ori	r2,r2,init_task@l
+
+	/* ptr to phys current thread */
+	tophys(r4,r2)
+	addi	r4,r4,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG_THREAD,r4
+
+	/* stack */
+	lis	r1,init_thread_union@ha
+	addi	r1,r1,init_thread_union@l
+	li	r0,0
+	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
+
+	bl	early_init	/* We have to do this with MMU on */
+
+/*
+ * Decide what sort of machine this is and initialize the MMU.
+ */
+	li	r3,0
+	mr	r4,r31
+	bl	machine_init
+	bl	MMU_init
+
+/* Go back to running unmapped so we can load up new values
+ * and change to using our exception vectors.
+ * On the 4xx, all we have to do is invalidate the TLB to clear
+ * the old 16M byte TLB mappings.
+ */
+	lis	r4,2f@h
+	ori	r4,r4,2f@l
+	tophys(r4,r4)
+	lis	r3,(MSR_KERNEL & ~(MSR_IR|MSR_DR))@h
+	ori	r3,r3,(MSR_KERNEL & ~(MSR_IR|MSR_DR))@l
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	rfi
+	b	.		/* prevent prefetch past rfi */
+
+/* Load up the kernel context */
+2:
+	sync			/* Flush to memory before changing TLB */
+	tlbia
+	isync			/* Flush shadow TLBs */
+
+	/* set up the PTE pointers for the Abatron bdiGDB.
+	*/
+	lis	r6, swapper_pg_dir@h
+	ori	r6, r6, swapper_pg_dir@l
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r5, 0xf0(r0)	/* Must match your Abatron config file */
+	tophys(r5,r5)
+	stw	r6, 0(r5)
+
+/* Now turn on the MMU for real! */
+	lis	r4,MSR_KERNEL@h
+	ori	r4,r4,MSR_KERNEL@l
+	lis	r3,start_kernel@h
+	ori	r3,r3,start_kernel@l
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	rfi			/* enable MMU and jump to start_kernel */
+	b	.		/* prevent prefetch past rfi */
+
+/* Set up the initial MMU state so we can do the first level of
+ * kernel initialization.  This maps the first 16 MBytes of memory 1:1
+ * virtual to physical and more importantly sets the cache mode.
+ */
+initial_mmu:
+	tlbia			/* Invalidate all TLB entries */
+	isync
+
+	/* We should still be executing code at physical address 0x0000xxxx
+	 * at this point. However, start_here is at virtual address
+	 * 0xC000xxxx. So, set up a TLB mapping to cover this once
+	 * translation is enabled.
+	 */
+
+	lis	r3,KERNELBASE@h		/* Load the kernel virtual address */
+	ori	r3,r3,KERNELBASE@l
+	tophys(r4,r3)			/* Load the kernel physical address */
+
+	iccci	r0,r3			/* Invalidate the i-cache before use */
+
+	/* Load the kernel PID.
+	*/
+	li	r0,0
+	mtspr	SPRN_PID,r0
+	sync
+
+	/* Configure and load one entry into TLB slots 63 */
+	clrrwi	r4,r4,10		/* Mask off the real page number */
+	ori	r4,r4,(TLB_WR | TLB_EX)	/* Set the write and execute bits */
+
+	clrrwi	r3,r3,10		/* Mask off the effective page number */
+	ori	r3,r3,(TLB_VALID | TLB_PAGESZ(PAGESZ_16M))
+
+        li      r0,63                    /* TLB slot 63 */
+
+	tlbwe	r4,r0,TLB_DATA		/* Load the data portion of the entry */
+	tlbwe	r3,r0,TLB_TAG		/* Load the tag portion of the entry */
+
+	isync
+
+	/* Establish the exception vector base
+	*/
+	lis	r4,KERNELBASE@h		/* EVPR only uses the high 16-bits */
+	tophys(r0,r4)			/* Use the physical address */
+	mtspr	SPRN_EVPR,r0
+
+	blr
+
+_GLOBAL(abort)
+        mfspr   r13,SPRN_DBCR0
+        oris    r13,r13,DBCR0_RST_SYSTEM@h
+        mtspr   SPRN_DBCR0,r13
+
+_GLOBAL(set_context)
+
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is the second parameter.
+	 */
+	lis	r5, KERNELBASE@h
+	lwz	r5, 0xf0(r5)
+	stw	r4, 0x4(r5)
+#endif
+	sync
+	mtspr	SPRN_PID,r3
+	isync				/* Need an isync to flush shadow */
+					/* TLBs after changing PID */
+	blr
+
+/* We put a few things here that have to be page-aligned. This stuff
+ * goes at the beginning of the data segment, which is page-aligned.
+ */
+	.data
+	.align	12
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	4096
+EXPORT_SYMBOL(empty_zero_page)
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+/* Room for two PTE pointers, usually the kernel and current user pointers
+ * to their respective root page table.
+ */
+abatron_pteptrs:
+	.space	8
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
new file mode 100644
index 000000000..37e4a7cf0
--- /dev/null
+++ b/arch/powerpc/kernel/head_44x.S
@@ -0,0 +1,1278 @@
+/*
+ * Kernel execution entry point code.
+ *
+ *    Copyright (c) 1995-1996 Gary Thomas <gdt@linuxppc.org>
+ *      Initial PowerPC version.
+ *    Copyright (c) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *      Rewritten for PReP
+ *    Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ *      Low-level exception handers, MMU support, and rewrite.
+ *    Copyright (c) 1997 Dan Malek <dmalek@jlc.net>
+ *      PowerPC 8xx modifications.
+ *    Copyright (c) 1998-1999 TiVo, Inc.
+ *      PowerPC 403GCX modifications.
+ *    Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu>
+ *      PowerPC 403GCX/405GP modifications.
+ *    Copyright 2000 MontaVista Software Inc.
+ *	PPC405 modifications
+ *      PowerPC 403GCX/405GP modifications.
+ * 	Author: MontaVista Software, Inc.
+ *         	frank_rowand@mvista.com or source@mvista.com
+ * 	   	debbie_chu@mvista.com
+ *    Copyright 2002-2005 MontaVista Software, Inc.
+ *      PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/synch.h>
+#include <asm/export.h>
+#include "head_booke.h"
+
+
+/* As with the other PowerPC ports, it is expected that when code
+ * execution begins here, the following registers contain valid, yet
+ * optional, information:
+ *
+ *   r3 - Board info structure pointer (DRAM, frequency, MAC address, etc.)
+ *   r4 - Starting address of the init RAM disk
+ *   r5 - Ending address of the init RAM disk
+ *   r6 - Start of kernel command line string (e.g. "mem=128")
+ *   r7 - End of kernel command line string
+ *
+ */
+	__HEAD
+_ENTRY(_stext);
+_ENTRY(_start);
+	/*
+	 * Reserve a word at a fixed location to store the address
+	 * of abatron_pteptrs
+	 */
+	nop
+	mr	r31,r3		/* save device tree ptr */
+	li	r24,0		/* CPU number */
+
+#ifdef CONFIG_RELOCATABLE
+/*
+ * Relocate ourselves to the current runtime address.
+ * This is called only by the Boot CPU.
+ * "relocate" is called with our current runtime virutal
+ * address.
+ * r21 will be loaded with the physical runtime address of _stext
+ */
+	bl	0f				/* Get our runtime address */
+0:	mflr	r21				/* Make it accessible */
+	addis	r21,r21,(_stext - 0b)@ha
+	addi	r21,r21,(_stext - 0b)@l 	/* Get our current runtime base */
+
+	/*
+	 * We have the runtime (virutal) address of our base.
+	 * We calculate our shift of offset from a 256M page.
+	 * We could map the 256M page we belong to at PAGE_OFFSET and
+	 * get going from there.
+	 */
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	rlwinm	r6,r21,0,4,31			/* r6 = PHYS_START % 256M */
+	rlwinm	r5,r4,0,4,31			/* r5 = KERNELBASE % 256M */
+	subf	r3,r5,r6			/* r3 = r6 - r5 */
+	add	r3,r4,r3			/* Required Virutal Address */
+
+	bl	relocate
+#endif
+
+	bl	init_cpu_state
+
+	/*
+	 * This is where the main kernel code starts.
+	 */
+
+	/* ptr to current */
+	lis	r2,init_task@h
+	ori	r2,r2,init_task@l
+
+	/* ptr to current thread */
+	addi	r4,r2,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG_THREAD,r4
+
+	/* stack */
+	lis	r1,init_thread_union@h
+	ori	r1,r1,init_thread_union@l
+	li	r0,0
+	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
+
+	bl	early_init
+
+#ifdef CONFIG_RELOCATABLE
+	/*
+	 * Relocatable kernel support based on processing of dynamic
+	 * relocation entries.
+	 *
+	 * r25 will contain RPN/ERPN for the start address of memory
+	 * r21 will contain the current offset of _stext
+	 */
+	lis	r3,kernstart_addr@ha
+	la	r3,kernstart_addr@l(r3)
+
+	/*
+	 * Compute the kernstart_addr.
+	 * kernstart_addr => (r6,r8)
+	 * kernstart_addr & ~0xfffffff => (r6,r7)
+	 */
+	rlwinm	r6,r25,0,28,31	/* ERPN. Bits 32-35 of Address */
+	rlwinm	r7,r25,0,0,3	/* RPN - assuming 256 MB page size */
+	rlwinm	r8,r21,0,4,31	/* r8 = (_stext & 0xfffffff) */
+	or	r8,r7,r8	/* Compute the lower 32bit of kernstart_addr */
+
+	/* Store kernstart_addr */
+	stw	r6,0(r3)	/* higher 32bit */
+	stw	r8,4(r3)	/* lower 32bit  */
+
+	/*
+	 * Compute the virt_phys_offset :
+	 * virt_phys_offset = stext.run - kernstart_addr
+	 *
+	 * stext.run = (KERNELBASE & ~0xfffffff) + (kernstart_addr & 0xfffffff)
+	 * When we relocate, we have :
+	 *
+	 *	(kernstart_addr & 0xfffffff) = (stext.run & 0xfffffff)
+	 *
+	 * hence:
+	 *  virt_phys_offset = (KERNELBASE & ~0xfffffff) - (kernstart_addr & ~0xfffffff)
+	 *
+	 */
+
+	/* KERNELBASE&~0xfffffff => (r4,r5) */
+	li	r4, 0		/* higer 32bit */
+	lis	r5,KERNELBASE@h
+	rlwinm	r5,r5,0,0,3	/* Align to 256M, lower 32bit */
+
+	/*
+	 * 64bit subtraction.
+	 */
+	subfc	r5,r7,r5
+	subfe	r4,r6,r4
+
+	/* Store virt_phys_offset */
+	lis	r3,virt_phys_offset@ha
+	la	r3,virt_phys_offset@l(r3)
+
+	stw	r4,0(r3)
+	stw	r5,4(r3)
+
+#elif defined(CONFIG_DYNAMIC_MEMSTART)
+	/*
+	 * Mapping based, page aligned dynamic kernel loading.
+	 *
+	 * r25 will contain RPN/ERPN for the start address of memory
+	 *
+	 * Add the difference between KERNELBASE and PAGE_OFFSET to the
+	 * start of physical memory to get kernstart_addr.
+	 */
+	lis	r3,kernstart_addr@ha
+	la	r3,kernstart_addr@l(r3)
+
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	lis	r5,PAGE_OFFSET@h
+	ori	r5,r5,PAGE_OFFSET@l
+	subf	r4,r5,r4
+
+	rlwinm	r6,r25,0,28,31	/* ERPN */
+	rlwinm	r7,r25,0,0,3	/* RPN - assuming 256 MB page size */
+	add	r7,r7,r4
+
+	stw	r6,0(r3)
+	stw	r7,4(r3)
+#endif
+
+/*
+ * Decide what sort of machine this is and initialize the MMU.
+ */
+	li	r3,0
+	mr	r4,r31
+	bl	machine_init
+	bl	MMU_init
+
+	/* Setup PTE pointers for the Abatron bdiGDB */
+	lis	r6, swapper_pg_dir@h
+	ori	r6, r6, swapper_pg_dir@l
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	lis	r4, KERNELBASE@h
+	ori	r4, r4, KERNELBASE@l
+	stw	r5, 0(r4)	/* Save abatron_pteptrs at a fixed location */
+	stw	r6, 0(r5)
+
+	/* Clear the Machine Check Syndrome Register */
+	li	r0,0
+	mtspr	SPRN_MCSR,r0
+
+	/* Let's move on */
+	lis	r4,start_kernel@h
+	ori	r4,r4,start_kernel@l
+	lis	r3,MSR_KERNEL@h
+	ori	r3,r3,MSR_KERNEL@l
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	rfi			/* change context and jump to start_kernel */
+
+/*
+ * Interrupt vector entry code
+ *
+ * The Book E MMUs are always on so we don't need to handle
+ * interrupts in real mode as with previous PPC processors. In
+ * this case we handle interrupts in the kernel virtual address
+ * space.
+ *
+ * Interrupt vectors are dynamically placed relative to the
+ * interrupt prefix as determined by the address of interrupt_base.
+ * The interrupt vectors offsets are programmed using the labels
+ * for each interrupt vector entry.
+ *
+ * Interrupt vectors must be aligned on a 16 byte boundary.
+ * We align on a 32 byte cache line boundary for good measure.
+ */
+
+interrupt_base:
+	/* Critical Input Interrupt */
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
+
+	/* Machine Check Interrupt */
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
+	MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception)
+
+	/* Data Storage Interrupt */
+	DATA_STORAGE_EXCEPTION
+
+		/* Instruction Storage Interrupt */
+	INSTRUCTION_STORAGE_EXCEPTION
+
+	/* External Input Interrupt */
+	EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \
+		  do_IRQ, EXC_XFER_LITE)
+
+	/* Alignment Interrupt */
+	ALIGNMENT_EXCEPTION
+
+	/* Program Interrupt */
+	PROGRAM_EXCEPTION
+
+	/* Floating Point Unavailable Interrupt */
+#ifdef CONFIG_PPC_FPU
+	FP_UNAVAILABLE_EXCEPTION
+#else
+	EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \
+		  FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+#endif
+	/* System Call Interrupt */
+	START_EXCEPTION(SystemCall)
+	NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL)
+	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
+
+	/* Auxiliary Processor Unavailable Interrupt */
+	EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \
+		  AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+
+	/* Decrementer Interrupt */
+	DECREMENTER_EXCEPTION
+
+	/* Fixed Internal Timer Interrupt */
+	/* TODO: Add FIT support */
+	EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
+
+	/* Watchdog Timer Interrupt */
+	/* TODO: Add watchdog support */
+#ifdef CONFIG_BOOKE_WDT
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException)
+#else
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception)
+#endif
+
+	/* Data TLB Error Interrupt */
+	START_EXCEPTION(DataTLBError44x)
+	mtspr	SPRN_SPRG_WSCRATCH0, r10		/* Save some working registers */
+	mtspr	SPRN_SPRG_WSCRATCH1, r11
+	mtspr	SPRN_SPRG_WSCRATCH2, r12
+	mtspr	SPRN_SPRG_WSCRATCH3, r13
+	mfcr	r11
+	mtspr	SPRN_SPRG_WSCRATCH4, r11
+	mfspr	r10, SPRN_DEAR		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	r10, r11
+	blt+	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+
+	mfspr	r12,SPRN_MMUCR
+	rlwinm	r12,r12,0,0,23		/* Clear TID */
+
+	b	4f
+
+	/* Get the PGD for the current thread */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+
+	/* Load PID into MMUCR TID */
+	mfspr	r12,SPRN_MMUCR
+	mfspr   r13,SPRN_PID		/* Get PID */
+	rlwimi	r12,r13,0,24,31		/* Set TID */
+
+4:
+	mtspr	SPRN_MMUCR,r12
+
+	/* Mask of required permission bits. Note that while we
+	 * do copy ESR:ST to _PAGE_RW position as trying to write
+	 * to an RO page is pretty common, we don't do it with
+	 * _PAGE_DIRTY. We could do it, but it's a fairly rare
+	 * event so I'd rather take the overhead when it happens
+	 * rather than adding an instruction here. We should measure
+	 * whether the whole thing is worth it in the first place
+	 * as we could avoid loading SPRN_ESR completely in the first
+	 * place...
+	 *
+	 * TODO: Is it worth doing that mfspr & rlwimi in the first
+	 *       place or can we save a couple of instructions here ?
+	 */
+	mfspr	r12,SPRN_ESR
+	li	r13,_PAGE_PRESENT|_PAGE_ACCESSED
+	rlwimi	r13,r12,10,30,30
+
+	/* Load the PTE */
+	/* Compute pgdir/pmd offset */
+	rlwinm  r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29
+	lwzx	r11, r12, r11		/* Get pgd/pmd entry */
+	rlwinm.	r12, r11, 0, 0, 20	/* Extract pt base address */
+	beq	2f			/* Bail if no table */
+
+	/* Compute pte address */
+	rlwimi  r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28
+	lwz	r11, 0(r12)		/* Get high word of pte entry */
+	lwz	r12, 4(r12)		/* Get low word of pte entry */
+
+	lis	r10,tlb_44x_index@ha
+
+	andc.	r13,r13,r12		/* Check permission */
+
+	/* Load the next available TLB index */
+	lwz	r13,tlb_44x_index@l(r10)
+
+	bne	2f			/* Bail if permission mismach */
+
+	/* Increment, rollover, and store TLB index */
+	addi	r13,r13,1
+
+	/* Compare with watermark (instruction gets patched) */
+	.globl tlb_44x_patch_hwater_D
+tlb_44x_patch_hwater_D:
+	cmpwi	0,r13,1			/* reserve entries */
+	ble	5f
+	li	r13,0
+5:
+	/* Store the next available TLB index */
+	stw	r13,tlb_44x_index@l(r10)
+
+	/* Re-load the faulting address */
+	mfspr	r10,SPRN_DEAR
+
+	 /* Jump to common tlb load */
+	b	finish_tlb_load_44x
+
+2:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r11, SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13, SPRN_SPRG_RSCRATCH3
+	mfspr	r12, SPRN_SPRG_RSCRATCH2
+	mfspr	r11, SPRN_SPRG_RSCRATCH1
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	b	DataStorage
+
+	/* Instruction TLB Error Interrupt */
+	/*
+	 * Nearly the same as above, except we get our
+	 * information from different registers and bailout
+	 * to a different point.
+	 */
+	START_EXCEPTION(InstructionTLBError44x)
+	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
+	mtspr	SPRN_SPRG_WSCRATCH1, r11
+	mtspr	SPRN_SPRG_WSCRATCH2, r12
+	mtspr	SPRN_SPRG_WSCRATCH3, r13
+	mfcr	r11
+	mtspr	SPRN_SPRG_WSCRATCH4, r11
+	mfspr	r10, SPRN_SRR0		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	r10, r11
+	blt+	3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+
+	mfspr	r12,SPRN_MMUCR
+	rlwinm	r12,r12,0,0,23		/* Clear TID */
+
+	b	4f
+
+	/* Get the PGD for the current thread */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+
+	/* Load PID into MMUCR TID */
+	mfspr	r12,SPRN_MMUCR
+	mfspr   r13,SPRN_PID		/* Get PID */
+	rlwimi	r12,r13,0,24,31		/* Set TID */
+
+4:
+	mtspr	SPRN_MMUCR,r12
+
+	/* Make up the required permissions */
+	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+
+	/* Compute pgdir/pmd offset */
+	rlwinm 	r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29
+	lwzx	r11, r12, r11		/* Get pgd/pmd entry */
+	rlwinm.	r12, r11, 0, 0, 20	/* Extract pt base address */
+	beq	2f			/* Bail if no table */
+
+	/* Compute pte address */
+	rlwimi	r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28
+	lwz	r11, 0(r12)		/* Get high word of pte entry */
+	lwz	r12, 4(r12)		/* Get low word of pte entry */
+
+	lis	r10,tlb_44x_index@ha
+
+	andc.	r13,r13,r12		/* Check permission */
+
+	/* Load the next available TLB index */
+	lwz	r13,tlb_44x_index@l(r10)
+
+	bne	2f			/* Bail if permission mismach */
+
+	/* Increment, rollover, and store TLB index */
+	addi	r13,r13,1
+
+	/* Compare with watermark (instruction gets patched) */
+	.globl tlb_44x_patch_hwater_I
+tlb_44x_patch_hwater_I:
+	cmpwi	0,r13,1			/* reserve entries */
+	ble	5f
+	li	r13,0
+5:
+	/* Store the next available TLB index */
+	stw	r13,tlb_44x_index@l(r10)
+
+	/* Re-load the faulting address */
+	mfspr	r10,SPRN_SRR0
+
+	/* Jump to common TLB load point */
+	b	finish_tlb_load_44x
+
+2:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r11, SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13, SPRN_SPRG_RSCRATCH3
+	mfspr	r12, SPRN_SPRG_RSCRATCH2
+	mfspr	r11, SPRN_SPRG_RSCRATCH1
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	b	InstructionStorage
+
+/*
+ * Both the instruction and data TLB miss get to this
+ * point to load the TLB.
+ * 	r10 - EA of fault
+ * 	r11 - PTE high word value
+ *	r12 - PTE low word value
+ *	r13 - TLB index
+ *	MMUCR - loaded with proper value when we get here
+ *	Upon exit, we reload everything and RFI.
+ */
+finish_tlb_load_44x:
+	/* Combine RPN & ERPN an write WS 0 */
+	rlwimi	r11,r12,0,0,31-PAGE_SHIFT
+	tlbwe	r11,r13,PPC44x_TLB_XLAT
+
+	/*
+	 * Create WS1. This is the faulting address (EPN),
+	 * page size, and valid flag.
+	 */
+	li	r11,PPC44x_TLB_VALID | PPC44x_TLBE_SIZE
+	/* Insert valid and page size */
+	rlwimi	r10,r11,0,PPC44x_PTE_ADD_MASK_BIT,31
+	tlbwe	r10,r13,PPC44x_TLB_PAGEID	/* Write PAGEID */
+
+	/* And WS 2 */
+	li	r10,0xf85			/* Mask to apply from PTE */
+	rlwimi	r10,r12,29,30,30		/* DIRTY -> SW position */
+	and	r11,r12,r10			/* Mask PTE bits to keep */
+	andi.	r10,r12,_PAGE_USER		/* User page ? */
+	beq	1f				/* nope, leave U bits empty */
+	rlwimi	r11,r11,3,26,28			/* yes, copy S bits to U */
+1:	tlbwe	r11,r13,PPC44x_TLB_ATTRIB	/* Write ATTRIB */
+
+	/* Done...restore registers and get out of here.
+	*/
+	mfspr	r11, SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13, SPRN_SPRG_RSCRATCH3
+	mfspr	r12, SPRN_SPRG_RSCRATCH2
+	mfspr	r11, SPRN_SPRG_RSCRATCH1
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	rfi					/* Force context change */
+
+/* TLB error interrupts for 476
+ */
+#ifdef CONFIG_PPC_47x
+	START_EXCEPTION(DataTLBError47x)
+	mtspr	SPRN_SPRG_WSCRATCH0,r10	/* Save some working registers */
+	mtspr	SPRN_SPRG_WSCRATCH1,r11
+	mtspr	SPRN_SPRG_WSCRATCH2,r12
+	mtspr	SPRN_SPRG_WSCRATCH3,r13
+	mfcr	r11
+	mtspr	SPRN_SPRG_WSCRATCH4,r11
+	mfspr	r10,SPRN_DEAR		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11,PAGE_OFFSET@h
+	cmplw	cr0,r10,r11
+	blt+	3f
+	lis	r11,swapper_pg_dir@h
+	ori	r11,r11, swapper_pg_dir@l
+	li	r12,0			/* MMUCR = 0 */
+	b	4f
+
+	/* Get the PGD for the current thread and setup MMUCR */
+3:	mfspr	r11,SPRN_SPRG3
+	lwz	r11,PGDIR(r11)
+	mfspr   r12,SPRN_PID		/* Get PID */
+4:	mtspr	SPRN_MMUCR,r12		/* Set MMUCR */
+
+	/* Mask of required permission bits. Note that while we
+	 * do copy ESR:ST to _PAGE_RW position as trying to write
+	 * to an RO page is pretty common, we don't do it with
+	 * _PAGE_DIRTY. We could do it, but it's a fairly rare
+	 * event so I'd rather take the overhead when it happens
+	 * rather than adding an instruction here. We should measure
+	 * whether the whole thing is worth it in the first place
+	 * as we could avoid loading SPRN_ESR completely in the first
+	 * place...
+	 *
+	 * TODO: Is it worth doing that mfspr & rlwimi in the first
+	 *       place or can we save a couple of instructions here ?
+	 */
+	mfspr	r12,SPRN_ESR
+	li	r13,_PAGE_PRESENT|_PAGE_ACCESSED
+	rlwimi	r13,r12,10,30,30
+
+	/* Load the PTE */
+	/* Compute pgdir/pmd offset */
+	rlwinm  r12,r10,PPC44x_PGD_OFF_SHIFT,PPC44x_PGD_OFF_MASK_BIT,29
+	lwzx	r11,r12,r11		/* Get pgd/pmd entry */
+
+	/* Word 0 is EPN,V,TS,DSIZ */
+	li	r12,PPC47x_TLB0_VALID | PPC47x_TLBE_SIZE
+	rlwimi	r10,r12,0,32-PAGE_SHIFT,31	/* Insert valid and page size*/
+	li	r12,0
+	tlbwe	r10,r12,0
+
+	/* XXX can we do better ? Need to make sure tlbwe has established
+	 * latch V bit in MMUCR0 before the PTE is loaded further down */
+#ifdef CONFIG_SMP
+	isync
+#endif
+
+	rlwinm.	r12,r11,0,0,20		/* Extract pt base address */
+	/* Compute pte address */
+	rlwimi  r12,r10,PPC44x_PTE_ADD_SHIFT,PPC44x_PTE_ADD_MASK_BIT,28
+	beq	2f			/* Bail if no table */
+	lwz	r11,0(r12)		/* Get high word of pte entry */
+
+	/* XXX can we do better ? maybe insert a known 0 bit from r11 into the
+	 * bottom of r12 to create a data dependency... We can also use r10
+	 * as destination nowadays
+	 */
+#ifdef CONFIG_SMP
+	lwsync
+#endif
+	lwz	r12,4(r12)		/* Get low word of pte entry */
+
+	andc.	r13,r13,r12		/* Check permission */
+
+	 /* Jump to common tlb load */
+	beq	finish_tlb_load_47x
+
+2:	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r11,SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13,SPRN_SPRG_RSCRATCH3
+	mfspr	r12,SPRN_SPRG_RSCRATCH2
+	mfspr	r11,SPRN_SPRG_RSCRATCH1
+	mfspr	r10,SPRN_SPRG_RSCRATCH0
+	b	DataStorage
+
+	/* Instruction TLB Error Interrupt */
+	/*
+	 * Nearly the same as above, except we get our
+	 * information from different registers and bailout
+	 * to a different point.
+	 */
+	START_EXCEPTION(InstructionTLBError47x)
+	mtspr	SPRN_SPRG_WSCRATCH0,r10	/* Save some working registers */
+	mtspr	SPRN_SPRG_WSCRATCH1,r11
+	mtspr	SPRN_SPRG_WSCRATCH2,r12
+	mtspr	SPRN_SPRG_WSCRATCH3,r13
+	mfcr	r11
+	mtspr	SPRN_SPRG_WSCRATCH4,r11
+	mfspr	r10,SPRN_SRR0		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11,PAGE_OFFSET@h
+	cmplw	cr0,r10,r11
+	blt+	3f
+	lis	r11,swapper_pg_dir@h
+	ori	r11,r11, swapper_pg_dir@l
+	li	r12,0			/* MMUCR = 0 */
+	b	4f
+
+	/* Get the PGD for the current thread and setup MMUCR */
+3:	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+	mfspr   r12,SPRN_PID		/* Get PID */
+4:	mtspr	SPRN_MMUCR,r12		/* Set MMUCR */
+
+	/* Make up the required permissions */
+	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+
+	/* Load PTE */
+	/* Compute pgdir/pmd offset */
+	rlwinm  r12,r10,PPC44x_PGD_OFF_SHIFT,PPC44x_PGD_OFF_MASK_BIT,29
+	lwzx	r11,r12,r11		/* Get pgd/pmd entry */
+
+	/* Word 0 is EPN,V,TS,DSIZ */
+	li	r12,PPC47x_TLB0_VALID | PPC47x_TLBE_SIZE
+	rlwimi	r10,r12,0,32-PAGE_SHIFT,31	/* Insert valid and page size*/
+	li	r12,0
+	tlbwe	r10,r12,0
+
+	/* XXX can we do better ? Need to make sure tlbwe has established
+	 * latch V bit in MMUCR0 before the PTE is loaded further down */
+#ifdef CONFIG_SMP
+	isync
+#endif
+
+	rlwinm.	r12,r11,0,0,20		/* Extract pt base address */
+	/* Compute pte address */
+	rlwimi  r12,r10,PPC44x_PTE_ADD_SHIFT,PPC44x_PTE_ADD_MASK_BIT,28
+	beq	2f			/* Bail if no table */
+
+	lwz	r11,0(r12)		/* Get high word of pte entry */
+	/* XXX can we do better ? maybe insert a known 0 bit from r11 into the
+	 * bottom of r12 to create a data dependency... We can also use r10
+	 * as destination nowadays
+	 */
+#ifdef CONFIG_SMP
+	lwsync
+#endif
+	lwz	r12,4(r12)		/* Get low word of pte entry */
+
+	andc.	r13,r13,r12		/* Check permission */
+
+	/* Jump to common TLB load point */
+	beq	finish_tlb_load_47x
+
+2:	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r11, SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13, SPRN_SPRG_RSCRATCH3
+	mfspr	r12, SPRN_SPRG_RSCRATCH2
+	mfspr	r11, SPRN_SPRG_RSCRATCH1
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	b	InstructionStorage
+
+/*
+ * Both the instruction and data TLB miss get to this
+ * point to load the TLB.
+ * 	r10 - free to use
+ * 	r11 - PTE high word value
+ *	r12 - PTE low word value
+ *      r13 - free to use
+ *	MMUCR - loaded with proper value when we get here
+ *	Upon exit, we reload everything and RFI.
+ */
+finish_tlb_load_47x:
+	/* Combine RPN & ERPN an write WS 1 */
+	rlwimi	r11,r12,0,0,31-PAGE_SHIFT
+	tlbwe	r11,r13,1
+
+	/* And make up word 2 */
+	li	r10,0xf85			/* Mask to apply from PTE */
+	rlwimi	r10,r12,29,30,30		/* DIRTY -> SW position */
+	and	r11,r12,r10			/* Mask PTE bits to keep */
+	andi.	r10,r12,_PAGE_USER		/* User page ? */
+	beq	1f				/* nope, leave U bits empty */
+	rlwimi	r11,r11,3,26,28			/* yes, copy S bits to U */
+1:	tlbwe	r11,r13,2
+
+	/* Done...restore registers and get out of here.
+	*/
+	mfspr	r11, SPRN_SPRG_RSCRATCH4
+	mtcr	r11
+	mfspr	r13, SPRN_SPRG_RSCRATCH3
+	mfspr	r12, SPRN_SPRG_RSCRATCH2
+	mfspr	r11, SPRN_SPRG_RSCRATCH1
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	rfi
+
+#endif /* CONFIG_PPC_47x */
+
+	/* Debug Interrupt */
+	/*
+	 * This statement needs to exist at the end of the IVPR
+	 * definition just in case you end up taking a debug
+	 * exception within another exception.
+	 */
+	DEBUG_CRIT_EXCEPTION
+
+interrupt_end:
+
+/*
+ * Global functions
+ */
+
+/*
+ * Adjust the machine check IVOR on 440A cores
+ */
+_GLOBAL(__fixup_440A_mcheck)
+	li	r3,MachineCheckA@l
+	mtspr	SPRN_IVOR1,r3
+	sync
+	blr
+
+_GLOBAL(set_context)
+
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is the second parameter.
+	 */
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r4, 0x4(r5)
+#endif
+	mtspr	SPRN_PID,r3
+	isync			/* Force context change */
+	blr
+
+/*
+ * Init CPU state. This is called at boot time or for secondary CPUs
+ * to setup initial TLB entries, setup IVORs, etc...
+ *
+ */
+_GLOBAL(init_cpu_state)
+	mflr	r22
+#ifdef CONFIG_PPC_47x
+	/* We use the PVR to differentiate 44x cores from 476 */
+	mfspr	r3,SPRN_PVR
+	srwi	r3,r3,16
+	cmplwi	cr0,r3,PVR_476FPE@h
+	beq	head_start_47x
+	cmplwi	cr0,r3,PVR_476@h
+	beq	head_start_47x
+	cmplwi	cr0,r3,PVR_476_ISS@h
+	beq	head_start_47x
+#endif /* CONFIG_PPC_47x */
+
+/*
+ * In case the firmware didn't do it, we apply some workarounds
+ * that are good for all 440 core variants here
+ */
+	mfspr	r3,SPRN_CCR0
+	rlwinm	r3,r3,0,0,27	/* disable icache prefetch */
+	isync
+	mtspr	SPRN_CCR0,r3
+	isync
+	sync
+
+/*
+ * Set up the initial MMU state for 44x
+ *
+ * We are still executing code at the virtual address
+ * mappings set by the firmware for the base of RAM.
+ *
+ * We first invalidate all TLB entries but the one
+ * we are running from.  We then load the KERNELBASE
+ * mappings so we can begin to use kernel addresses
+ * natively and so the interrupt vector locations are
+ * permanently pinned (necessary since Book E
+ * implementations always have translation enabled).
+ *
+ * TODO: Use the known TLB entry we are running from to
+ *	 determine which physical region we are located
+ *	 in.  This can be used to determine where in RAM
+ *	 (on a shared CPU system) or PCI memory space
+ *	 (on a DRAMless system) we are located.
+ *       For now, we assume a perfect world which means
+ *	 we are located at the base of DRAM (physical 0).
+ */
+
+/*
+ * Search TLB for entry that we are currently using.
+ * Invalidate all entries but the one we are using.
+ */
+	/* Load our current PID->MMUCR TID and MSR IS->MMUCR STS */
+	mfspr	r3,SPRN_PID			/* Get PID */
+	mfmsr	r4				/* Get MSR */
+	andi.	r4,r4,MSR_IS@l			/* TS=1? */
+	beq	wmmucr				/* If not, leave STS=0 */
+	oris	r3,r3,PPC44x_MMUCR_STS@h	/* Set STS=1 */
+wmmucr:	mtspr	SPRN_MMUCR,r3			/* Put MMUCR */
+	sync
+
+	bl	invstr				/* Find our address */
+invstr:	mflr	r5				/* Make it accessible */
+	tlbsx	r23,0,r5			/* Find entry we are in */
+	li	r4,0				/* Start at TLB entry 0 */
+	li	r3,0				/* Set PAGEID inval value */
+1:	cmpw	r23,r4				/* Is this our entry? */
+	beq	skpinv				/* If so, skip the inval */
+	tlbwe	r3,r4,PPC44x_TLB_PAGEID		/* If not, inval the entry */
+skpinv:	addi	r4,r4,1				/* Increment */
+	cmpwi	r4,64				/* Are we done? */
+	bne	1b				/* If not, repeat */
+	isync					/* If so, context change */
+
+/*
+ * Configure and load pinned entry into TLB slot 63.
+ */
+#ifdef CONFIG_NONSTATIC_KERNEL
+	/*
+	 * In case of a NONSTATIC_KERNEL we reuse the TLB XLAT
+	 * entries of the initial mapping set by the boot loader.
+	 * The XLAT entry is stored in r25
+	 */
+
+	/* Read the XLAT entry for our current mapping */
+	tlbre	r25,r23,PPC44x_TLB_XLAT
+
+	lis	r3,KERNELBASE@h
+	ori	r3,r3,KERNELBASE@l
+
+	/* Use our current RPN entry */
+	mr	r4,r25
+#else
+
+	lis	r3,PAGE_OFFSET@h
+	ori	r3,r3,PAGE_OFFSET@l
+
+	/* Kernel is at the base of RAM */
+	li r4, 0			/* Load the kernel physical address */
+#endif
+
+	/* Load the kernel PID = 0 */
+	li	r0,0
+	mtspr	SPRN_PID,r0
+	sync
+
+	/* Initialize MMUCR */
+	li	r5,0
+	mtspr	SPRN_MMUCR,r5
+	sync
+
+	/* pageid fields */
+	clrrwi	r3,r3,10		/* Mask off the effective page number */
+	ori	r3,r3,PPC44x_TLB_VALID | PPC44x_TLB_256M
+
+	/* xlat fields */
+	clrrwi	r4,r4,10		/* Mask off the real page number */
+					/* ERPN is 0 for first 4GB page */
+
+	/* attrib fields */
+	/* Added guarded bit to protect against speculative loads/stores */
+	li	r5,0
+	ori	r5,r5,(PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G)
+
+        li      r0,63                    /* TLB slot 63 */
+
+	tlbwe	r3,r0,PPC44x_TLB_PAGEID	/* Load the pageid fields */
+	tlbwe	r4,r0,PPC44x_TLB_XLAT	/* Load the translation fields */
+	tlbwe	r5,r0,PPC44x_TLB_ATTRIB	/* Load the attrib/access fields */
+
+	/* Force context change */
+	mfmsr	r0
+	mtspr	SPRN_SRR1, r0
+	lis	r0,3f@h
+	ori	r0,r0,3f@l
+	mtspr	SPRN_SRR0,r0
+	sync
+	rfi
+
+	/* If necessary, invalidate original entry we used */
+3:	cmpwi	r23,63
+	beq	4f
+	li	r6,0
+	tlbwe   r6,r23,PPC44x_TLB_PAGEID
+	isync
+
+4:
+#ifdef CONFIG_PPC_EARLY_DEBUG_44x
+	/* Add UART mapping for early debug. */
+
+	/* pageid fields */
+	lis	r3,PPC44x_EARLY_DEBUG_VIRTADDR@h
+	ori	r3,r3,PPC44x_TLB_VALID|PPC44x_TLB_TS|PPC44x_TLB_64K
+
+	/* xlat fields */
+	lis	r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW@h
+	ori	r4,r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH
+
+	/* attrib fields */
+	li	r5,(PPC44x_TLB_SW|PPC44x_TLB_SR|PPC44x_TLB_I|PPC44x_TLB_G)
+        li      r0,62                    /* TLB slot 0 */
+
+	tlbwe	r3,r0,PPC44x_TLB_PAGEID
+	tlbwe	r4,r0,PPC44x_TLB_XLAT
+	tlbwe	r5,r0,PPC44x_TLB_ATTRIB
+
+	/* Force context change */
+	isync
+#endif /* CONFIG_PPC_EARLY_DEBUG_44x */
+
+	/* Establish the interrupt vector offsets */
+	SET_IVOR(0,  CriticalInput);
+	SET_IVOR(1,  MachineCheck);
+	SET_IVOR(2,  DataStorage);
+	SET_IVOR(3,  InstructionStorage);
+	SET_IVOR(4,  ExternalInput);
+	SET_IVOR(5,  Alignment);
+	SET_IVOR(6,  Program);
+	SET_IVOR(7,  FloatingPointUnavailable);
+	SET_IVOR(8,  SystemCall);
+	SET_IVOR(9,  AuxillaryProcessorUnavailable);
+	SET_IVOR(10, Decrementer);
+	SET_IVOR(11, FixedIntervalTimer);
+	SET_IVOR(12, WatchdogTimer);
+	SET_IVOR(13, DataTLBError44x);
+	SET_IVOR(14, InstructionTLBError44x);
+	SET_IVOR(15, DebugCrit);
+
+	b	head_start_common
+
+
+#ifdef CONFIG_PPC_47x
+
+#ifdef CONFIG_SMP
+
+/* Entry point for secondary 47x processors */
+_GLOBAL(start_secondary_47x)
+        mr      r24,r3          /* CPU number */
+
+	bl	init_cpu_state
+
+	/* Now we need to bolt the rest of kernel memory which
+	 * is done in C code. We must be careful because our task
+	 * struct or our stack can (and will probably) be out
+	 * of reach of the initial 256M TLB entry, so we use a
+	 * small temporary stack in .bss for that. This works
+	 * because only one CPU at a time can be in this code
+	 */
+	lis	r1,temp_boot_stack@h
+	ori	r1,r1,temp_boot_stack@l
+	addi	r1,r1,1024-STACK_FRAME_OVERHEAD
+	li	r0,0
+	stw	r0,0(r1)
+	bl	mmu_init_secondary
+
+	/* Now we can get our task struct and real stack pointer */
+
+	/* Get current_thread_info and current */
+	lis	r1,secondary_ti@ha
+	lwz	r1,secondary_ti@l(r1)
+	lwz	r2,TI_TASK(r1)
+
+	/* Current stack pointer */
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	li	r0,0
+	stw	r0,0(r1)
+
+	/* Kernel stack for exception entry in SPRG3 */
+	addi	r4,r2,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG3,r4
+
+	b	start_secondary
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Set up the initial MMU state for 44x
+ *
+ * We are still executing code at the virtual address
+ * mappings set by the firmware for the base of RAM.
+ */
+
+head_start_47x:
+	/* Load our current PID->MMUCR TID and MSR IS->MMUCR STS */
+	mfspr	r3,SPRN_PID			/* Get PID */
+	mfmsr	r4				/* Get MSR */
+	andi.	r4,r4,MSR_IS@l			/* TS=1? */
+	beq	1f				/* If not, leave STS=0 */
+	oris	r3,r3,PPC47x_MMUCR_STS@h	/* Set STS=1 */
+1:	mtspr	SPRN_MMUCR,r3			/* Put MMUCR */
+	sync
+
+	/* Find the entry we are running from */
+	bl	1f
+1:	mflr	r23
+	tlbsx	r23,0,r23
+	tlbre	r24,r23,0
+	tlbre	r25,r23,1
+	tlbre	r26,r23,2
+
+/*
+ * Cleanup time
+ */
+
+	/* Initialize MMUCR */
+	li	r5,0
+	mtspr	SPRN_MMUCR,r5
+	sync
+
+clear_all_utlb_entries:
+
+	#; Set initial values.
+
+	addis		r3,0,0x8000
+	addi		r4,0,0
+	addi		r5,0,0
+	b		clear_utlb_entry
+
+	#; Align the loop to speed things up.
+
+	.align		6
+
+clear_utlb_entry:
+
+	tlbwe		r4,r3,0
+	tlbwe		r5,r3,1
+	tlbwe		r5,r3,2
+	addis		r3,r3,0x2000
+	cmpwi		r3,0
+	bne		clear_utlb_entry
+	addis		r3,0,0x8000
+	addis		r4,r4,0x100
+	cmpwi		r4,0
+	bne		clear_utlb_entry
+
+	#; Restore original entry.
+
+	oris	r23,r23,0x8000  /* specify the way */
+	tlbwe		r24,r23,0
+	tlbwe		r25,r23,1
+	tlbwe		r26,r23,2
+
+/*
+ * Configure and load pinned entry into TLB for the kernel core
+ */
+
+	lis	r3,PAGE_OFFSET@h
+	ori	r3,r3,PAGE_OFFSET@l
+
+	/* Load the kernel PID = 0 */
+	li	r0,0
+	mtspr	SPRN_PID,r0
+	sync
+
+	/* Word 0 */
+	clrrwi	r3,r3,12		/* Mask off the effective page number */
+	ori	r3,r3,PPC47x_TLB0_VALID | PPC47x_TLB0_256M
+
+	/* Word 1 - use r25.  RPN is the same as the original entry */
+
+	/* Word 2 */
+	li	r5,0
+	ori	r5,r5,PPC47x_TLB2_S_RWX
+#ifdef CONFIG_SMP
+	ori	r5,r5,PPC47x_TLB2_M
+#endif
+
+	/* We write to way 0 and bolted 0 */
+	lis	r0,0x8800
+	tlbwe	r3,r0,0
+	tlbwe	r25,r0,1
+	tlbwe	r5,r0,2
+
+/*
+ * Configure SSPCR, ISPCR and USPCR for now to search everything, we can fix
+ * them up later
+ */
+	LOAD_REG_IMMEDIATE(r3, 0x9abcdef0)
+	mtspr	SPRN_SSPCR,r3
+	mtspr	SPRN_USPCR,r3
+	LOAD_REG_IMMEDIATE(r3, 0x12345670)
+	mtspr	SPRN_ISPCR,r3
+
+	/* Force context change */
+	mfmsr	r0
+	mtspr	SPRN_SRR1, r0
+	lis	r0,3f@h
+	ori	r0,r0,3f@l
+	mtspr	SPRN_SRR0,r0
+	sync
+	rfi
+
+	/* Invalidate original entry we used */
+3:
+	rlwinm	r24,r24,0,21,19 /* clear the "valid" bit */
+	tlbwe	r24,r23,0
+	addi	r24,0,0
+	tlbwe	r24,r23,1
+	tlbwe	r24,r23,2
+	isync                   /* Clear out the shadow TLB entries */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_44x
+	/* Add UART mapping for early debug. */
+
+	/* Word 0 */
+	lis	r3,PPC44x_EARLY_DEBUG_VIRTADDR@h
+	ori	r3,r3,PPC47x_TLB0_VALID | PPC47x_TLB0_TS | PPC47x_TLB0_1M
+
+	/* Word 1 */
+	lis	r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW@h
+	ori	r4,r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH
+
+	/* Word 2 */
+	li	r5,(PPC47x_TLB2_S_RW | PPC47x_TLB2_IMG)
+
+	/* Bolted in way 0, bolt slot 5, we -hope- we don't hit the same
+	 * congruence class as the kernel, we need to make sure of it at
+	 * some point
+	 */
+        lis	r0,0x8d00
+	tlbwe	r3,r0,0
+	tlbwe	r4,r0,1
+	tlbwe	r5,r0,2
+
+	/* Force context change */
+	isync
+#endif /* CONFIG_PPC_EARLY_DEBUG_44x */
+
+	/* Establish the interrupt vector offsets */
+	SET_IVOR(0,  CriticalInput);
+	SET_IVOR(1,  MachineCheckA);
+	SET_IVOR(2,  DataStorage);
+	SET_IVOR(3,  InstructionStorage);
+	SET_IVOR(4,  ExternalInput);
+	SET_IVOR(5,  Alignment);
+	SET_IVOR(6,  Program);
+	SET_IVOR(7,  FloatingPointUnavailable);
+	SET_IVOR(8,  SystemCall);
+	SET_IVOR(9,  AuxillaryProcessorUnavailable);
+	SET_IVOR(10, Decrementer);
+	SET_IVOR(11, FixedIntervalTimer);
+	SET_IVOR(12, WatchdogTimer);
+	SET_IVOR(13, DataTLBError47x);
+	SET_IVOR(14, InstructionTLBError47x);
+	SET_IVOR(15, DebugCrit);
+
+	/* We configure icbi to invalidate 128 bytes at a time since the
+	 * current 32-bit kernel code isn't too happy with icache != dcache
+	 * block size. We also disable the BTAC as this can cause errors
+	 * in some circumstances (see IBM Erratum 47).
+	 */
+	mfspr	r3,SPRN_CCR0
+	oris	r3,r3,0x0020
+	ori	r3,r3,0x0040
+	mtspr	SPRN_CCR0,r3
+	isync
+
+#endif /* CONFIG_PPC_47x */
+
+/*
+ * Here we are back to code that is common between 44x and 47x
+ *
+ * We proceed to further kernel initialization and return to the
+ * main kernel entry
+ */
+head_start_common:
+	/* Establish the interrupt vector base */
+	lis	r4,interrupt_base@h	/* IVPR only uses the high 16-bits */
+	mtspr	SPRN_IVPR,r4
+
+	/*
+	 * If the kernel was loaded at a non-zero 256 MB page, we need to
+	 * mask off the most significant 4 bits to get the relative address
+	 * from the start of physical memory
+	 */
+	rlwinm	r22,r22,0,4,31
+	addis	r22,r22,PAGE_OFFSET@h
+	mtlr	r22
+	isync
+	blr
+
+/*
+ * We put a few things here that have to be page-aligned. This stuff
+ * goes at the beginning of the data segment, which is page-aligned.
+ */
+	.data
+	.align	PAGE_SHIFT
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	PAGE_SIZE
+EXPORT_SYMBOL(empty_zero_page)
+
+/*
+ * To support >32-bit physical addresses, we use an 8KB pgdir.
+ */
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+/*
+ * Room for two PTE pointers, usually the kernel and current user pointers
+ * to their respective root page table.
+ */
+abatron_pteptrs:
+	.space	8
+
+#ifdef CONFIG_SMP
+	.align	12
+temp_boot_stack:
+	.space	1024
+#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
new file mode 100644
index 000000000..4f7b225d7
--- /dev/null
+++ b/arch/powerpc/kernel/head_64.S
@@ -0,0 +1,1020 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *
+ *  Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and
+ *    Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com
+ *
+ *  This file contains the entry point for the 64-bit kernel along
+ *  with some early initialization code common to all 64-bit powerpc
+ *  variants.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/head-64.h>
+#include <asm/asm-offsets.h>
+#include <asm/bug.h>
+#include <asm/cputable.h>
+#include <asm/setup.h>
+#include <asm/hvcall.h>
+#include <asm/thread_info.h>
+#include <asm/firmware.h>
+#include <asm/page_64.h>
+#include <asm/irqflags.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/ptrace.h>
+#include <asm/hw_irq.h>
+#include <asm/cputhreads.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+
+/* The physical memory is laid out such that the secondary processor
+ * spin code sits at 0x0000...0x00ff. On server, the vectors follow
+ * using the layout described in exceptions-64s.S
+ */
+
+/*
+ * Entering into this code we make the following assumptions:
+ *
+ *  For pSeries or server processors:
+ *   1. The MMU is off & open firmware is running in real mode.
+ *   2. The primary CPU enters at __start.
+ *   3. If the RTAS supports "query-cpu-stopped-state", then secondary
+ *      CPUs will enter as directed by "start-cpu" RTAS call, which is
+ *      generic_secondary_smp_init, with PIR in r3.
+ *   4. Else the secondary CPUs will enter at secondary_hold (0x60) as
+ *      directed by the "start-cpu" RTS call, with PIR in r3.
+ * -or- For OPAL entry:
+ *   1. The MMU is off, processor in HV mode.
+ *   2. The primary CPU enters at 0 with device-tree in r3, OPAL base
+ *      in r8, and entry in r9 for debugging purposes.
+ *   3. Secondary CPUs enter as directed by OPAL_START_CPU call, which
+ *      is at generic_secondary_smp_init, with PIR in r3.
+ *
+ *  For Book3E processors:
+ *   1. The MMU is on running in AS0 in a state defined in ePAPR
+ *   2. The kernel is entered at __start
+ */
+
+OPEN_FIXED_SECTION(first_256B, 0x0, 0x100)
+USE_FIXED_SECTION(first_256B)
+	/*
+	 * Offsets are relative from the start of fixed section, and
+	 * first_256B starts at 0. Offsets are a bit easier to use here
+	 * than the fixed section entry macros.
+	 */
+	. = 0x0
+_GLOBAL(__start)
+	/* NOP this out unconditionally */
+BEGIN_FTR_SECTION
+	FIXUP_ENDIAN
+	b	__start_initialization_multiplatform
+END_FTR_SECTION(0, 1)
+
+	/* Catch branch to 0 in real mode */
+	trap
+
+	/* Secondary processors spin on this value until it becomes non-zero.
+	 * When non-zero, it contains the real address of the function the cpu
+	 * should jump to.
+	 */
+	.balign 8
+	.globl  __secondary_hold_spinloop
+__secondary_hold_spinloop:
+	.8byte	0x0
+
+	/* Secondary processors write this value with their cpu # */
+	/* after they enter the spin loop immediately below.	  */
+	.globl	__secondary_hold_acknowledge
+__secondary_hold_acknowledge:
+	.8byte	0x0
+
+#ifdef CONFIG_RELOCATABLE
+	/* This flag is set to 1 by a loader if the kernel should run
+	 * at the loaded address instead of the linked address.  This
+	 * is used by kexec-tools to keep the the kdump kernel in the
+	 * crash_kernel region.  The loader is responsible for
+	 * observing the alignment requirement.
+	 */
+
+#ifdef CONFIG_RELOCATABLE_TEST
+#define RUN_AT_LOAD_DEFAULT 1		/* Test relocation, do not copy to 0 */
+#else
+#define RUN_AT_LOAD_DEFAULT 0x72756e30  /* "run0" -- relocate to 0 by default */
+#endif
+
+	/* Do not move this variable as kexec-tools knows about it. */
+	. = 0x5c
+	.globl	__run_at_load
+__run_at_load:
+DEFINE_FIXED_SYMBOL(__run_at_load)
+	.long	RUN_AT_LOAD_DEFAULT
+#endif
+
+	. = 0x60
+/*
+ * The following code is used to hold secondary processors
+ * in a spin loop after they have entered the kernel, but
+ * before the bulk of the kernel has been relocated.  This code
+ * is relocated to physical address 0x60 before prom_init is run.
+ * All of it must fit below the first exception vector at 0x100.
+ * Use .globl here not _GLOBAL because we want __secondary_hold
+ * to be the actual text address, not a descriptor.
+ */
+	.globl	__secondary_hold
+__secondary_hold:
+	FIXUP_ENDIAN
+#ifndef CONFIG_PPC_BOOK3E
+	mfmsr	r24
+	ori	r24,r24,MSR_RI
+	mtmsrd	r24			/* RI on */
+#endif
+	/* Grab our physical cpu number */
+	mr	r24,r3
+	/* stash r4 for book3e */
+	mr	r25,r4
+
+	/* Tell the master cpu we're here */
+	/* Relocation is off & we are located at an address less */
+	/* than 0x100, so only need to grab low order offset.    */
+	std	r24,(ABS_ADDR(__secondary_hold_acknowledge))(0)
+	sync
+
+	li	r26,0
+#ifdef CONFIG_PPC_BOOK3E
+	tovirt(r26,r26)
+#endif
+	/* All secondary cpus wait here until told to start. */
+100:	ld	r12,(ABS_ADDR(__secondary_hold_spinloop))(r26)
+	cmpdi	0,r12,0
+	beq	100b
+
+#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
+#ifdef CONFIG_PPC_BOOK3E
+	tovirt(r12,r12)
+#endif
+	mtctr	r12
+	mr	r3,r24
+	/*
+	 * it may be the case that other platforms have r4 right to
+	 * begin with, this gives us some safety in case it is not
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	mr	r4,r25
+#else
+	li	r4,0
+#endif
+	/* Make sure that patched code is visible */
+	isync
+	bctr
+#else
+	BUG_OPCODE
+#endif
+CLOSE_FIXED_SECTION(first_256B)
+
+/* This value is used to mark exception frames on the stack. */
+	.section ".toc","aw"
+exception_marker:
+	.tc	ID_72656773_68657265[TC],0x7265677368657265
+	.previous
+
+/*
+ * On server, we include the exception vectors code here as it
+ * relies on absolute addressing which is only possible within
+ * this compilation unit
+ */
+#ifdef CONFIG_PPC_BOOK3S
+#include "exceptions-64s.S"
+#else
+OPEN_TEXT_SECTION(0x100)
+#endif
+
+USE_TEXT_SECTION()
+
+#ifdef CONFIG_PPC_BOOK3E
+/*
+ * The booting_thread_hwid holds the thread id we want to boot in cpu
+ * hotplug case. It is set by cpu hotplug code, and is invalid by default.
+ * The thread id is the same as the initial value of SPRN_PIR[THREAD_ID]
+ * bit field.
+ */
+	.globl	booting_thread_hwid
+booting_thread_hwid:
+	.long  INVALID_THREAD_HWID
+	.align 3
+/*
+ * start a thread in the same core
+ * input parameters:
+ * r3 = the thread physical id
+ * r4 = the entry point where thread starts
+ */
+_GLOBAL(book3e_start_thread)
+	LOAD_REG_IMMEDIATE(r5, MSR_KERNEL)
+	cmpwi	r3, 0
+	beq	10f
+	cmpwi	r3, 1
+	beq	11f
+	/* If the thread id is invalid, just exit. */
+	b	13f
+10:
+	MTTMR(TMRN_IMSR0, 5)
+	MTTMR(TMRN_INIA0, 4)
+	b	12f
+11:
+	MTTMR(TMRN_IMSR1, 5)
+	MTTMR(TMRN_INIA1, 4)
+12:
+	isync
+	li	r6, 1
+	sld	r6, r6, r3
+	mtspr	SPRN_TENS, r6
+13:
+	blr
+
+/*
+ * stop a thread in the same core
+ * input parameter:
+ * r3 = the thread physical id
+ */
+_GLOBAL(book3e_stop_thread)
+	cmpwi	r3, 0
+	beq	10f
+	cmpwi	r3, 1
+	beq	10f
+	/* If the thread id is invalid, just exit. */
+	b	13f
+10:
+	li	r4, 1
+	sld	r4, r4, r3
+	mtspr	SPRN_TENC, r4
+13:
+	blr
+
+_GLOBAL(fsl_secondary_thread_init)
+	mfspr	r4,SPRN_BUCSR
+
+	/* Enable branch prediction */
+	lis     r3,BUCSR_INIT@h
+	ori     r3,r3,BUCSR_INIT@l
+	mtspr   SPRN_BUCSR,r3
+	isync
+
+	/*
+	 * Fix PIR to match the linear numbering in the device tree.
+	 *
+	 * On e6500, the reset value of PIR uses the low three bits for
+	 * the thread within a core, and the upper bits for the core
+	 * number.  There are two threads per core, so shift everything
+	 * but the low bit right by two bits so that the cpu numbering is
+	 * continuous.
+	 *
+	 * If the old value of BUCSR is non-zero, this thread has run
+	 * before.  Thus, we assume we are coming from kexec or a similar
+	 * scenario, and PIR is already set to the correct value.  This
+	 * is a bit of a hack, but there are limited opportunities for
+	 * getting information into the thread and the alternatives
+	 * seemed like they'd be overkill.  We can't tell just by looking
+	 * at the old PIR value which state it's in, since the same value
+	 * could be valid for one thread out of reset and for a different
+	 * thread in Linux.
+	 */
+
+	mfspr	r3, SPRN_PIR
+	cmpwi	r4,0
+	bne	1f
+	rlwimi	r3, r3, 30, 2, 30
+	mtspr	SPRN_PIR, r3
+1:
+#endif
+
+_GLOBAL(generic_secondary_thread_init)
+	mr	r24,r3
+
+	/* turn on 64-bit mode */
+	bl	enable_64b_mode
+
+	/* get a valid TOC pointer, wherever we're mapped at */
+	bl	relative_toc
+	tovirt(r2,r2)
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* Book3E initialization */
+	mr	r3,r24
+	bl	book3e_secondary_thread_init
+#endif
+	b	generic_secondary_common_init
+
+/*
+ * On pSeries and most other platforms, secondary processors spin
+ * in the following code.
+ * At entry, r3 = this processor's number (physical cpu id)
+ *
+ * On Book3E, r4 = 1 to indicate that the initial TLB entry for
+ * this core already exists (setup via some other mechanism such
+ * as SCOM before entry).
+ */
+_GLOBAL(generic_secondary_smp_init)
+	FIXUP_ENDIAN
+	mr	r24,r3
+	mr	r25,r4
+
+	/* turn on 64-bit mode */
+	bl	enable_64b_mode
+
+	/* get a valid TOC pointer, wherever we're mapped at */
+	bl	relative_toc
+	tovirt(r2,r2)
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* Book3E initialization */
+	mr	r3,r24
+	mr	r4,r25
+	bl	book3e_secondary_core_init
+
+/*
+ * After common core init has finished, check if the current thread is the
+ * one we wanted to boot. If not, start the specified thread and stop the
+ * current thread.
+ */
+	LOAD_REG_ADDR(r4, booting_thread_hwid)
+	lwz     r3, 0(r4)
+	li	r5, INVALID_THREAD_HWID
+	cmpw	r3, r5
+	beq	20f
+
+	/*
+	 * The value of booting_thread_hwid has been stored in r3,
+	 * so make it invalid.
+	 */
+	stw	r5, 0(r4)
+
+	/*
+	 * Get the current thread id and check if it is the one we wanted.
+	 * If not, start the one specified in booting_thread_hwid and stop
+	 * the current thread.
+	 */
+	mfspr	r8, SPRN_TIR
+	cmpw	r3, r8
+	beq	20f
+
+	/* start the specified thread */
+	LOAD_REG_ADDR(r5, fsl_secondary_thread_init)
+	ld	r4, 0(r5)
+	bl	book3e_start_thread
+
+	/* stop the current thread */
+	mr	r3, r8
+	bl	book3e_stop_thread
+10:
+	b	10b
+20:
+#endif
+
+generic_secondary_common_init:
+	/* Set up a paca value for this processor. Since we have the
+	 * physical cpu id in r24, we need to search the pacas to find
+	 * which logical id maps to our physical one.
+	 */
+#ifndef CONFIG_SMP
+	b	kexec_wait		/* wait for next kernel if !SMP	 */
+#else
+	LOAD_REG_ADDR(r8, paca_ptrs)	/* Load paca_ptrs pointe	 */
+	ld	r8,0(r8)		/* Get base vaddr of array	 */
+	LOAD_REG_ADDR(r7, nr_cpu_ids)	/* Load nr_cpu_ids address       */
+	lwz	r7,0(r7)		/* also the max paca allocated 	 */
+	li	r5,0			/* logical cpu id                */
+1:
+	sldi	r9,r5,3			/* get paca_ptrs[] index from cpu id */
+	ldx	r13,r9,r8		/* r13 = paca_ptrs[cpu id]       */
+	lhz	r6,PACAHWCPUID(r13)	/* Load HW procid from paca      */
+	cmpw	r6,r24			/* Compare to our id             */
+	beq	2f
+	addi	r5,r5,1
+	cmpw	r5,r7			/* Check if more pacas exist     */
+	blt	1b
+
+	mr	r3,r24			/* not found, copy phys to r3	 */
+	b	kexec_wait		/* next kernel might do better	 */
+
+2:	SET_PACA(r13)
+#ifdef CONFIG_PPC_BOOK3E
+	addi	r12,r13,PACA_EXTLB	/* and TLB exc frame in another  */
+	mtspr	SPRN_SPRG_TLB_EXFRAME,r12
+#endif
+
+	/* From now on, r24 is expected to be logical cpuid */
+	mr	r24,r5
+
+	/* Create a temp kernel stack for use before relocation is on.	*/
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/* See if we need to call a cpu state restore handler */
+	LOAD_REG_ADDR(r23, cur_cpu_spec)
+	ld	r23,0(r23)
+	ld	r12,CPU_SPEC_RESTORE(r23)
+	cmpdi	0,r12,0
+	beq	3f
+#ifdef PPC64_ELF_ABI_v1
+	ld	r12,0(r12)
+#endif
+	mtctr	r12
+	bctrl
+
+3:	LOAD_REG_ADDR(r3, spinning_secondaries) /* Decrement spinning_secondaries */
+	lwarx	r4,0,r3
+	subi	r4,r4,1
+	stwcx.	r4,0,r3
+	bne	3b
+	isync
+
+4:	HMT_LOW
+	lbz	r23,PACAPROCSTART(r13)	/* Test if this processor should */
+					/* start.			 */
+	cmpwi	0,r23,0
+	beq	4b			/* Loop until told to go	 */
+
+	sync				/* order paca.run and cur_cpu_spec */
+	isync				/* In case code patching happened */
+
+	b	__secondary_start
+#endif /* SMP */
+
+/*
+ * Turn the MMU off.
+ * Assumes we're mapped EA == RA if the MMU is on.
+ */
+#ifdef CONFIG_PPC_BOOK3S
+__mmu_off:
+	mfmsr	r3
+	andi.	r0,r3,MSR_IR|MSR_DR
+	beqlr
+	mflr	r4
+	andc	r3,r3,r0
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	sync
+	rfid
+	b	.	/* prevent speculative execution */
+#endif
+
+
+/*
+ * Here is our main kernel entry point. We support currently 2 kind of entries
+ * depending on the value of r5.
+ *
+ *   r5 != NULL -> OF entry, we go to prom_init, "legacy" parameter content
+ *                 in r3...r7
+ *   
+ *   r5 == NULL -> kexec style entry. r3 is a physical pointer to the
+ *                 DT block, r4 is a physical pointer to the kernel itself
+ *
+ */
+__start_initialization_multiplatform:
+	/* Make sure we are running in 64 bits mode */
+	bl	enable_64b_mode
+
+	/* Get TOC pointer (current runtime address) */
+	bl	relative_toc
+
+	/* find out where we are now */
+	bcl	20,31,$+4
+0:	mflr	r26			/* r26 = runtime addr here */
+	addis	r26,r26,(_stext - 0b)@ha
+	addi	r26,r26,(_stext - 0b)@l	/* current runtime base addr */
+
+	/*
+	 * Are we booted from a PROM Of-type client-interface ?
+	 */
+	cmpldi	cr0,r5,0
+	beq	1f
+	b	__boot_from_prom		/* yes -> prom */
+1:
+	/* Save parameters */
+	mr	r31,r3
+	mr	r30,r4
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+	/* Save OPAL entry */
+	mr	r28,r8
+	mr	r29,r9
+#endif
+
+#ifdef CONFIG_PPC_BOOK3E
+	bl	start_initialization_book3e
+	b	__after_prom_start
+#else
+	/* Setup some critical 970 SPRs before switching MMU off */
+	mfspr	r0,SPRN_PVR
+	srwi	r0,r0,16
+	cmpwi	r0,0x39		/* 970 */
+	beq	1f
+	cmpwi	r0,0x3c		/* 970FX */
+	beq	1f
+	cmpwi	r0,0x44		/* 970MP */
+	beq	1f
+	cmpwi	r0,0x45		/* 970GX */
+	bne	2f
+1:	bl	__cpu_preinit_ppc970
+2:
+
+	/* Switch off MMU if not already off */
+	bl	__mmu_off
+	b	__after_prom_start
+#endif /* CONFIG_PPC_BOOK3E */
+
+__boot_from_prom:
+#ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
+	/* Save parameters */
+	mr	r31,r3
+	mr	r30,r4
+	mr	r29,r5
+	mr	r28,r6
+	mr	r27,r7
+
+	/*
+	 * Align the stack to 16-byte boundary
+	 * Depending on the size and layout of the ELF sections in the initial
+	 * boot binary, the stack pointer may be unaligned on PowerMac
+	 */
+	rldicr	r1,r1,0,59
+
+#ifdef CONFIG_RELOCATABLE
+	/* Relocate code for where we are now */
+	mr	r3,r26
+	bl	relocate
+#endif
+
+	/* Restore parameters */
+	mr	r3,r31
+	mr	r4,r30
+	mr	r5,r29
+	mr	r6,r28
+	mr	r7,r27
+
+	/* Do all of the interaction with OF client interface */
+	mr	r8,r26
+	bl	prom_init
+#endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */
+
+	/* We never return. We also hit that trap if trying to boot
+	 * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */
+	trap
+
+__after_prom_start:
+#ifdef CONFIG_RELOCATABLE
+	/* process relocations for the final address of the kernel */
+	lis	r25,PAGE_OFFSET@highest	/* compute virtual base of kernel */
+	sldi	r25,r25,32
+#if defined(CONFIG_PPC_BOOK3E)
+	tovirt(r26,r26)		/* on booke, we already run at PAGE_OFFSET */
+#endif
+	lwz	r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26)
+#if defined(CONFIG_PPC_BOOK3E)
+	tophys(r26,r26)
+#endif
+	cmplwi	cr0,r7,1	/* flagged to stay where we are ? */
+	bne	1f
+	add	r25,r25,r26
+1:	mr	r3,r25
+	bl	relocate
+#if defined(CONFIG_PPC_BOOK3E)
+	/* IVPR needs to be set after relocation. */
+	bl	init_core_book3e
+#endif
+#endif
+
+/*
+ * We need to run with _stext at physical address PHYSICAL_START.
+ * This will leave some code in the first 256B of
+ * real memory, which are reserved for software use.
+ *
+ * Note: This process overwrites the OF exception vectors.
+ */
+	li	r3,0			/* target addr */
+#ifdef CONFIG_PPC_BOOK3E
+	tovirt(r3,r3)		/* on booke, we already run at PAGE_OFFSET */
+#endif
+	mr.	r4,r26			/* In some cases the loader may  */
+#if defined(CONFIG_PPC_BOOK3E)
+	tovirt(r4,r4)
+#endif
+	beq	9f			/* have already put us at zero */
+	li	r6,0x100		/* Start offset, the first 0x100 */
+					/* bytes were copied earlier.	 */
+
+#ifdef CONFIG_RELOCATABLE
+/*
+ * Check if the kernel has to be running as relocatable kernel based on the
+ * variable __run_at_load, if it is set the kernel is treated as relocatable
+ * kernel, otherwise it will be moved to PHYSICAL_START
+ */
+#if defined(CONFIG_PPC_BOOK3E)
+	tovirt(r26,r26)		/* on booke, we already run at PAGE_OFFSET */
+#endif
+	lwz	r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26)
+	cmplwi	cr0,r7,1
+	bne	3f
+
+#ifdef CONFIG_PPC_BOOK3E
+	LOAD_REG_ADDR(r5, __end_interrupts)
+	LOAD_REG_ADDR(r11, _stext)
+	sub	r5,r5,r11
+#else
+	/* just copy interrupts */
+	LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
+#endif
+	b	5f
+3:
+#endif
+	/* # bytes of memory to copy */
+	lis	r5,(ABS_ADDR(copy_to_here))@ha
+	addi	r5,r5,(ABS_ADDR(copy_to_here))@l
+
+	bl	copy_and_flush		/* copy the first n bytes	 */
+					/* this includes the code being	 */
+					/* executed here.		 */
+	/* Jump to the copy of this code that we just made */
+	addis	r8,r3,(ABS_ADDR(4f))@ha
+	addi	r12,r8,(ABS_ADDR(4f))@l
+	mtctr	r12
+	bctr
+
+.balign 8
+p_end: .8byte _end - copy_to_here
+
+4:
+	/*
+	 * Now copy the rest of the kernel up to _end, add
+	 * _end - copy_to_here to the copy limit and run again.
+	 */
+	addis   r8,r26,(ABS_ADDR(p_end))@ha
+	ld      r8,(ABS_ADDR(p_end))@l(r8)
+	add	r5,r5,r8
+5:	bl	copy_and_flush		/* copy the rest */
+
+9:	b	start_here_multiplatform
+
+/*
+ * Copy routine used to copy the kernel to start at physical address 0
+ * and flush and invalidate the caches as needed.
+ * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset
+ * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5.
+ *
+ * Note: this routine *only* clobbers r0, r6 and lr
+ */
+_GLOBAL(copy_and_flush)
+	addi	r5,r5,-8
+	addi	r6,r6,-8
+4:	li	r0,8			/* Use the smallest common	*/
+					/* denominator cache line	*/
+					/* size.  This results in	*/
+					/* extra cache line flushes	*/
+					/* but operation is correct.	*/
+					/* Can't get cache line size	*/
+					/* from NACA as it is being	*/
+					/* moved too.			*/
+
+	mtctr	r0			/* put # words/line in ctr	*/
+3:	addi	r6,r6,8			/* copy a cache line		*/
+	ldx	r0,r6,r4
+	stdx	r0,r6,r3
+	bdnz	3b
+	dcbst	r6,r3			/* write it to memory		*/
+	sync
+	icbi	r6,r3			/* flush the icache line	*/
+	cmpld	0,r6,r5
+	blt	4b
+	sync
+	addi	r5,r5,8
+	addi	r6,r6,8
+	isync
+	blr
+
+.align 8
+copy_to_here:
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PPC_PMAC
+/*
+ * On PowerMac, secondary processors starts from the reset vector, which
+ * is temporarily turned into a call to one of the functions below.
+ */
+	.section ".text";
+	.align 2 ;
+
+	.globl	__secondary_start_pmac_0
+__secondary_start_pmac_0:
+	/* NB the entries for cpus 0, 1, 2 must each occupy 8 bytes. */
+	li	r24,0
+	b	1f
+	li	r24,1
+	b	1f
+	li	r24,2
+	b	1f
+	li	r24,3
+1:
+	
+_GLOBAL(pmac_secondary_start)
+	/* turn on 64-bit mode */
+	bl	enable_64b_mode
+
+	li	r0,0
+	mfspr	r3,SPRN_HID4
+	rldimi	r3,r0,40,23	/* clear bit 23 (rm_ci) */
+	sync
+	mtspr	SPRN_HID4,r3
+	isync
+	sync
+	slbia
+
+	/* get TOC pointer (real address) */
+	bl	relative_toc
+	tovirt(r2,r2)
+
+	/* Copy some CPU settings from CPU 0 */
+	bl	__restore_cpu_ppc970
+
+	/* pSeries do that early though I don't think we really need it */
+	mfmsr	r3
+	ori	r3,r3,MSR_RI
+	mtmsrd	r3			/* RI on */
+
+	/* Set up a paca value for this processor. */
+	LOAD_REG_ADDR(r4,paca_ptrs)	/* Load paca pointer		*/
+	ld	r4,0(r4)		/* Get base vaddr of paca_ptrs array */
+	sldi	r5,r24,3		/* get paca_ptrs[] index from cpu id */
+	ldx	r13,r5,r4		/* r13 = paca_ptrs[cpu id]       */
+	SET_PACA(r13)			/* Save vaddr of paca in an SPRG*/
+
+	/* Mark interrupts soft and hard disabled (they might be enabled
+	 * in the PACA when doing hotplug)
+	 */
+	li	r0,IRQS_DISABLED
+	stb	r0,PACAIRQSOFTMASK(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+
+	/* Create a temp kernel stack for use before relocation is on.	*/
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	b	__secondary_start
+
+#endif /* CONFIG_PPC_PMAC */
+
+/*
+ * This function is called after the master CPU has released the
+ * secondary processors.  The execution environment is relocation off.
+ * The paca for this processor has the following fields initialized at
+ * this point:
+ *   1. Processor number
+ *   2. Segment table pointer (virtual address)
+ * On entry the following are set:
+ *   r1	       = stack pointer (real addr of temp stack)
+ *   r24       = cpu# (in Linux terms)
+ *   r13       = paca virtual address
+ *   SPRG_PACA = paca virtual address
+ */
+	.section ".text";
+	.align 2 ;
+
+	.globl	__secondary_start
+__secondary_start:
+	/* Set thread priority to MEDIUM */
+	HMT_MEDIUM
+
+	/* Initialize the kernel stack */
+	LOAD_REG_ADDR(r3, current_set)
+	sldi	r28,r24,3		/* get current_set[cpu#]	 */
+	ldx	r14,r3,r28
+	addi	r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	std	r14,PACAKSAVE(r13)
+
+	/* Do early setup for that CPU (SLB and hash table pointer) */
+	bl	early_setup_secondary
+
+	/*
+	 * setup the new stack pointer, but *don't* use this until
+	 * translation is on.
+	 */
+	mr	r1, r14
+
+	/* Clear backchain so we get nice backtraces */
+	li	r7,0
+	mtlr	r7
+
+	/* Mark interrupts soft and hard disabled (they might be enabled
+	 * in the PACA when doing hotplug)
+	 */
+	li	r7,IRQS_DISABLED
+	stb	r7,PACAIRQSOFTMASK(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+
+	/* enable MMU and jump to start_secondary */
+	LOAD_REG_ADDR(r3, start_secondary_prolog)
+	LOAD_REG_IMMEDIATE(r4, MSR_KERNEL)
+
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	RFI
+	b	.	/* prevent speculative execution */
+
+/* 
+ * Running with relocation on at this point.  All we want to do is
+ * zero the stack back-chain pointer and get the TOC virtual address
+ * before going into C code.
+ */
+start_secondary_prolog:
+	ld	r2,PACATOC(r13)
+	li	r3,0
+	std	r3,0(r1)		/* Zero the stack frame pointer	*/
+	bl	start_secondary
+	b	.
+/*
+ * Reset stack pointer and call start_secondary
+ * to continue with online operation when woken up
+ * from cede in cpu offline.
+ */
+_GLOBAL(start_secondary_resume)
+	ld	r1,PACAKSAVE(r13)	/* Reload kernel stack pointer */
+	li	r3,0
+	std	r3,0(r1)		/* Zero the stack frame pointer	*/
+	bl	start_secondary
+	b	.
+#endif
+
+/*
+ * This subroutine clobbers r11 and r12
+ */
+enable_64b_mode:
+	mfmsr	r11			/* grab the current MSR */
+#ifdef CONFIG_PPC_BOOK3E
+	oris	r11,r11,0x8000		/* CM bit set, we'll set ICM later */
+	mtmsr	r11
+#else /* CONFIG_PPC_BOOK3E */
+	li	r12,(MSR_64BIT | MSR_ISF)@highest
+	sldi	r12,r12,48
+	or	r11,r11,r12
+	mtmsrd	r11
+	isync
+#endif
+	blr
+
+/*
+ * This puts the TOC pointer into r2, offset by 0x8000 (as expected
+ * by the toolchain).  It computes the correct value for wherever we
+ * are running at the moment, using position-independent code.
+ *
+ * Note: The compiler constructs pointers using offsets from the
+ * TOC in -mcmodel=medium mode. After we relocate to 0 but before
+ * the MMU is on we need our TOC to be a virtual address otherwise
+ * these pointers will be real addresses which may get stored and
+ * accessed later with the MMU on. We use tovirt() at the call
+ * sites to handle this.
+ */
+_GLOBAL(relative_toc)
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r11
+	ld	r2,(p_toc - 0b)(r11)
+	add	r2,r2,r11
+	mtlr	r0
+	blr
+
+.balign 8
+p_toc:	.8byte	__toc_start + 0x8000 - 0b
+
+/*
+ * This is where the main kernel code starts.
+ */
+__REF
+start_here_multiplatform:
+	/* set up the TOC */
+	bl      relative_toc
+	tovirt(r2,r2)
+
+	/* Clear out the BSS. It may have been done in prom_init,
+	 * already but that's irrelevant since prom_init will soon
+	 * be detached from the kernel completely. Besides, we need
+	 * to clear it now for kexec-style entry.
+	 */
+	LOAD_REG_ADDR(r11,__bss_stop)
+	LOAD_REG_ADDR(r8,__bss_start)
+	sub	r11,r11,r8		/* bss size			*/
+	addi	r11,r11,7		/* round up to an even double word */
+	srdi.	r11,r11,3		/* shift right by 3		*/
+	beq	4f
+	addi	r8,r8,-8
+	li	r0,0
+	mtctr	r11			/* zero this many doublewords	*/
+3:	stdu	r0,8(r8)
+	bdnz	3b
+4:
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+	/* Setup OPAL entry */
+	LOAD_REG_ADDR(r11, opal)
+	std	r28,0(r11);
+	std	r29,8(r11);
+#endif
+
+#ifndef CONFIG_PPC_BOOK3E
+	mfmsr	r6
+	ori	r6,r6,MSR_RI
+	mtmsrd	r6			/* RI on */
+#endif
+
+#ifdef CONFIG_RELOCATABLE
+	/* Save the physical address we're running at in kernstart_addr */
+	LOAD_REG_ADDR(r4, kernstart_addr)
+	clrldi	r0,r25,2
+	std	r0,0(r4)
+#endif
+
+	/* set up a stack pointer */
+	LOAD_REG_ADDR(r3,init_thread_union)
+	LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
+	add	r1,r3,r1
+	li	r0,0
+	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+
+	/*
+	 * Do very early kernel initializations, including initial hash table
+	 * and SLB setup before we turn on relocation.
+	 */
+
+	/* Restore parameters passed from prom_init/kexec */
+	mr	r3,r31
+	LOAD_REG_ADDR(r12, DOTSYM(early_setup))
+	mtctr	r12
+	bctrl		/* also sets r13 and SPRG_PACA */
+
+	LOAD_REG_ADDR(r3, start_here_common)
+	ld	r4,PACAKMSR(r13)
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	RFI
+	b	.	/* prevent speculative execution */
+
+	.previous
+	/* This is where all platforms converge execution */
+
+start_here_common:
+	/* relocation is on at this point */
+	std	r1,PACAKSAVE(r13)
+
+	/* Load the TOC (virtual address) */
+	ld	r2,PACATOC(r13)
+
+	/* Mark interrupts soft and hard disabled (they might be enabled
+	 * in the PACA when doing hotplug)
+	 */
+	li	r0,IRQS_DISABLED
+	stb	r0,PACAIRQSOFTMASK(r13)
+	li	r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+
+	/* Generic kernel entry */
+	bl	start_kernel
+
+	/* Not reached */
+	BUG_OPCODE
+
+/*
+ * We put a few things here that have to be page-aligned.
+ * This stuff goes at the beginning of the bss, which is page-aligned.
+ */
+	.section ".bss"
+/*
+ * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K.
+ * We will need to find a better way to fix this
+ */
+	.align	16
+
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	PAGE_SIZE
+EXPORT_SYMBOL(empty_zero_page)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
new file mode 100644
index 000000000..08c16aa0d
--- /dev/null
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -0,0 +1,1070 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *  MPC8xx modifications by Dan Malek
+ *    Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
+ *
+ *  This file contains low-level support and setup for PowerPC 8xx
+ *  embedded processors, including trap and interrupt dispatch.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/cache.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+
+#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000
+/* By simply checking Address >= 0x80000000, we know if its a kernel address */
+#define SIMPLE_KERNEL_ADDRESS		1
+#endif
+
+/*
+ * We need an ITLB miss handler for kernel addresses if:
+ * - Either we have modules
+ * - Or we have not pinned the first 8M
+ */
+#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \
+    defined(CONFIG_DEBUG_PAGEALLOC)
+#define ITLB_MISS_KERNEL	1
+#endif
+
+/*
+ * Value for the bits that have fixed value in RPN entries.
+ * Also used for tagging DAR for DTLBerror.
+ */
+#define RPN_PATTERN	0x00f0
+
+#define PAGE_SHIFT_512K		19
+#define PAGE_SHIFT_8M		23
+
+	__HEAD
+_ENTRY(_stext);
+_ENTRY(_start);
+
+/* MPC8xx
+ * This port was done on an MBX board with an 860.  Right now I only
+ * support an ELF compressed (zImage) boot from EPPC-Bug because the
+ * code there loads up some registers before calling us:
+ *   r3: ptr to board info data
+ *   r4: initrd_start or if no initrd then 0
+ *   r5: initrd_end - unused if r4 is 0
+ *   r6: Start of command line string
+ *   r7: End of command line string
+ *
+ * I decided to use conditional compilation instead of checking PVR and
+ * adding more processor specific branches around code I don't need.
+ * Since this is an embedded processor, I also appreciate any memory
+ * savings I can get.
+ *
+ * The MPC8xx does not have any BATs, but it supports large page sizes.
+ * We first initialize the MMU to support 8M byte pages, then load one
+ * entry into each of the instruction and data TLBs to map the first
+ * 8M 1:1.  I also mapped an additional I/O space 1:1 so we can get to
+ * the "internal" processor registers before MMU_init is called.
+ *
+ *	-- Dan
+ */
+	.globl	__start
+__start:
+	mr	r31,r3			/* save device tree ptr */
+
+	/* We have to turn on the MMU right away so we get cache modes
+	 * set correctly.
+	 */
+	bl	initial_mmu
+
+/* We now have the lower 8 Meg mapped into TLB entries, and the caches
+ * ready to work.
+ */
+
+turn_on_mmu:
+	mfmsr	r0
+	ori	r0,r0,MSR_DR|MSR_IR
+	mtspr	SPRN_SRR1,r0
+	lis	r0,start_here@h
+	ori	r0,r0,start_here@l
+	mtspr	SPRN_SRR0,r0
+	rfi				/* enables MMU */
+
+/*
+ * Exception entry code.  This code runs with address translation
+ * turned off, i.e. using physical addresses.
+ * We assume sprg3 has the physical address of the current
+ * task's thread_struct.
+ */
+#define EXCEPTION_PROLOG	\
+	mtspr	SPRN_SPRG_SCRATCH0, r10;	\
+	mtspr	SPRN_SPRG_SCRATCH1, r11;	\
+	mfcr	r10;		\
+	EXCEPTION_PROLOG_1;	\
+	EXCEPTION_PROLOG_2
+
+#define EXCEPTION_PROLOG_1	\
+	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel */ \
+	andi.	r11,r11,MSR_PR;	\
+	tophys(r11,r1);			/* use tophys(r1) if kernel */ \
+	beq	1f;		\
+	mfspr	r11,SPRN_SPRG_THREAD;	\
+	lwz	r11,THREAD_INFO-THREAD(r11);	\
+	addi	r11,r11,THREAD_SIZE;	\
+	tophys(r11,r11);	\
+1:	subi	r11,r11,INT_FRAME_SIZE	/* alloc exc. frame */
+
+
+#define EXCEPTION_PROLOG_2	\
+	stw	r10,_CCR(r11);		/* save registers */ \
+	stw	r12,GPR12(r11);	\
+	stw	r9,GPR9(r11);	\
+	mfspr	r10,SPRN_SPRG_SCRATCH0;	\
+	stw	r10,GPR10(r11);	\
+	mfspr	r12,SPRN_SPRG_SCRATCH1;	\
+	stw	r12,GPR11(r11);	\
+	mflr	r10;		\
+	stw	r10,_LINK(r11);	\
+	mfspr	r12,SPRN_SRR0;	\
+	mfspr	r9,SPRN_SRR1;	\
+	stw	r1,GPR1(r11);	\
+	stw	r1,0(r11);	\
+	tovirt(r1,r11);			/* set new kernel sp */	\
+	li	r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \
+	mtmsr	r10;		\
+	stw	r0,GPR0(r11);	\
+	SAVE_4GPRS(3, r11);	\
+	SAVE_2GPRS(7, r11)
+
+/*
+ * Note: code which follows this uses cr0.eq (set if from kernel),
+ * r11, r12 (SRR0), and r9 (SRR1).
+ *
+ * Note2: once we have set r1 we are in a position to take exceptions
+ * again, and we could thus set MSR:RI at that point.
+ */
+
+/*
+ * Exception vectors.
+ */
+#define EXCEPTION(n, label, hdlr, xfer)		\
+	. = n;					\
+label:						\
+	EXCEPTION_PROLOG;			\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;	\
+	xfer(n, hdlr)
+
+#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret)	\
+	li	r10,trap;					\
+	stw	r10,_TRAP(r11);					\
+	li	r10,MSR_KERNEL;					\
+	copyee(r10, r9);					\
+	bl	tfer;						\
+i##n:								\
+	.long	hdlr;						\
+	.long	ret
+
+#define COPY_EE(d, s)		rlwimi d,s,0,16,16
+#define NOCOPY(d, s)
+
+#define EXC_XFER_STD(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full,	\
+			  ret_from_except_full)
+
+#define EXC_XFER_LITE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \
+			  ret_from_except)
+
+#define EXC_XFER_EE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_EE_LITE(n, hdlr)	\
+	EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \
+			  ret_from_except)
+
+/* System reset */
+	EXCEPTION(0x100, Reset, system_reset_exception, EXC_XFER_STD)
+
+/* Machine check */
+	. = 0x200
+MachineCheck:
+	EXCEPTION_PROLOG
+	mfspr r4,SPRN_DAR
+	stw r4,_DAR(r11)
+	li r5,RPN_PATTERN
+	mtspr SPRN_DAR,r5	/* Tag DAR, to be used in DTLB Error */
+	mfspr r5,SPRN_DSISR
+	stw r5,_DSISR(r11)
+	addi r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_STD(0x200, machine_check_exception)
+
+/* Data access exception.
+ * This is "never generated" by the MPC8xx.
+ */
+	. = 0x300
+DataAccess:
+
+/* Instruction access exception.
+ * This is "never generated" by the MPC8xx.
+ */
+	. = 0x400
+InstructionAccess:
+
+/* External interrupt */
+	EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
+
+/* Alignment exception */
+	. = 0x600
+Alignment:
+	EXCEPTION_PROLOG
+	mfspr	r4,SPRN_DAR
+	stw	r4,_DAR(r11)
+	li	r5,RPN_PATTERN
+	mtspr	SPRN_DAR,r5	/* Tag DAR, to be used in DTLB Error */
+	mfspr	r5,SPRN_DSISR
+	stw	r5,_DSISR(r11)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE(0x600, alignment_exception)
+
+/* Program check exception */
+	EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD)
+
+/* No FPU on MPC8xx.  This exception is not supposed to happen.
+*/
+	EXCEPTION(0x800, FPUnavailable, unknown_exception, EXC_XFER_STD)
+
+/* Decrementer */
+	EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE)
+
+	EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE)
+
+/* System call */
+	. = 0xc00
+SystemCall:
+	EXCEPTION_PROLOG
+	EXC_XFER_EE_LITE(0xc00, DoSyscall)
+
+/* Single step - not used on 601 */
+	EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD)
+	EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_EE)
+
+/* On the MPC8xx, this is a software emulation interrupt.  It occurs
+ * for all unimplemented and illegal instructions.
+ */
+	EXCEPTION(0x1000, SoftEmu, emulation_assist_interrupt, EXC_XFER_STD)
+
+	. = 0x1100
+/*
+ * For the MPC8xx, this is a software tablewalk to load the instruction
+ * TLB.  The task switch loads the M_TW register with the pointer to the first
+ * level table.
+ * If we discover there is no second level table (value is zero) or if there
+ * is an invalid pte, we load that into the TLB, which causes another fault
+ * into the TLB Error interrupt where we can handle such problems.
+ * We have to use the MD_xxx registers for the tablewalk because the
+ * equivalent MI_xxx registers only perform the attribute functions.
+ */
+
+#ifdef CONFIG_8xx_CPU15
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)	\
+	addi	tmp, addr, PAGE_SIZE;	\
+	tlbie	tmp;			\
+	addi	tmp, addr, -PAGE_SIZE;	\
+	tlbie	tmp
+#else
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)
+#endif
+
+InstructionTLBMiss:
+	mtspr	SPRN_SPRG_SCRATCH0, r10
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+	mtspr	SPRN_SPRG_SCRATCH2, r12
+#endif
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
+	/* Only modules will cause ITLB Misses as we always
+	 * pin the first 8MB of kernel memory */
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+	mfcr	r12
+#endif
+#ifdef ITLB_MISS_KERNEL
+#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
+	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
+#else
+	rlwinm	r11, r10, 16, 0xfff8
+	cmpli	cr0, r11, PAGE_OFFSET@h
+#ifndef CONFIG_PIN_TLB_TEXT
+	/* It is assumed that kernel code fits into the first 8M page */
+_ENTRY(ITLBMiss_cmp)
+	cmpli	cr7, r11, (PAGE_OFFSET + 0x0800000)@h
+#endif
+#endif
+#endif
+	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
+#ifdef ITLB_MISS_KERNEL
+#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
+	beq+	3f
+#else
+	blt+	3f
+#endif
+#ifndef CONFIG_PIN_TLB_TEXT
+	blt	cr7, ITLBMissLinear
+#endif
+	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
+3:
+#endif
+	/* Insert level 1 index */
+	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
+	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
+
+	/* Extract level 2 index */
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
+#ifdef CONFIG_HUGETLB_PAGE
+	mtcr	r11
+	bt-	28, 10f		/* bit 28 = Large page (8M) */
+	bt-	29, 20f		/* bit 29 = Large page (8M or 512k) */
+#endif
+	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
+	lwz	r10, 0(r10)	/* Get the pte */
+4:
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+	mtcr	r12
+#endif
+	/* Load the MI_TWC with the attributes for this "segment." */
+	mtspr	SPRN_MI_TWC, r11	/* Set segment attributes */
+
+	rlwinm	r11, r10, 32-7, _PAGE_PRESENT
+	and	r11, r11, r10
+	rlwimi	r10, r11, 0, _PAGE_PRESENT
+	li	r11, RPN_PATTERN | 0x200
+	/* The Linux PTE won't go exactly into the MMU TLB.
+	 * Software indicator bits 20 and 23 must be clear.
+	 * Software indicator bits 22, 24, 25, 26, and 27 must be
+	 * set.  All other Linux PTE bits control the behavior
+	 * of the MMU.
+	 */
+	rlwimi	r11, r10, 4, 0x0400	/* Copy _PAGE_EXEC into bit 21 */
+	rlwimi	r10, r11, 0, 0x0ff0	/* Set 22, 24-27, clear 20,23 */
+	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
+
+	/* Restore registers */
+_ENTRY(itlb_miss_exit_1)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+#endif
+	rfi
+#ifdef CONFIG_PERF_EVENTS
+_ENTRY(itlb_miss_perf)
+	lis	r10, (itlb_miss_counter - PAGE_OFFSET)@ha
+	lwz	r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
+	addi	r11, r11, 1
+	stw	r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
+#endif
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+#endif
+	rfi
+
+#ifdef CONFIG_HUGETLB_PAGE
+10:	/* 8M pages */
+#ifdef CONFIG_PPC_16K_PAGES
+	/* Extract level 2 index */
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT_8M - PAGE_SHIFT), 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1), 29
+	/* Add level 2 base */
+	rlwimi	r10, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
+#else
+	/* Level 2 base */
+	rlwinm	r10, r11, 0, ~HUGEPD_SHIFT_MASK
+#endif
+	lwz	r10, 0(r10)	/* Get the pte */
+	b	4b
+
+20:	/* 512k pages */
+	/* Extract level 2 index */
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT_512K - PAGE_SHIFT), 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1), 29
+	/* Add level 2 base */
+	rlwimi	r10, r11, 0, 0, 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1) - 1
+	lwz	r10, 0(r10)	/* Get the pte */
+	b	4b
+#endif
+
+	. = 0x1200
+DataStoreTLBMiss:
+	mtspr	SPRN_SPRG_SCRATCH0, r10
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+	mtspr	SPRN_SPRG_SCRATCH2, r12
+	mfcr	r12
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	mfspr	r10, SPRN_MD_EPN
+	rlwinm	r11, r10, 16, 0xfff8
+	cmpli	cr0, r11, PAGE_OFFSET@h
+	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
+	blt+	3f
+	rlwinm	r11, r10, 16, 0xfff8
+#ifndef CONFIG_PIN_TLB_IMMR
+	cmpli	cr0, r11, VIRT_IMMR_BASE@h
+#endif
+_ENTRY(DTLBMiss_cmp)
+	cmpli	cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+#ifndef CONFIG_PIN_TLB_IMMR
+_ENTRY(DTLBMiss_jmp)
+	beq-	DTLBMissIMMR
+#endif
+	blt	cr7, DTLBMissLinear
+	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
+3:
+
+	/* Insert level 1 index */
+	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
+	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
+
+	/* We have a pte table, so load fetch the pte from the table.
+	 */
+	/* Extract level 2 index */
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
+#ifdef CONFIG_HUGETLB_PAGE
+	mtcr	r11
+	bt-	28, 10f		/* bit 28 = Large page (8M) */
+	bt-	29, 20f		/* bit 29 = Large page (8M or 512k) */
+#endif
+	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
+	lwz	r10, 0(r10)	/* Get the pte */
+4:
+	mtcr	r12
+
+	/* Insert the Guarded flag into the TWC from the Linux PTE.
+	 * It is bit 27 of both the Linux PTE and the TWC (at least
+	 * I got that right :-).  It will be better when we can put
+	 * this into the Linux pgd/pmd and load it in the operation
+	 * above.
+	 */
+	rlwimi	r11, r10, 0, _PAGE_GUARDED
+	mtspr	SPRN_MD_TWC, r11
+
+	/* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set.
+	 * We also need to know if the insn is a load/store, so:
+	 * Clear _PAGE_PRESENT and load that which will
+	 * trap into DTLB Error with store bit set accordinly.
+	 */
+	/* PRESENT=0x1, ACCESSED=0x20
+	 * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5));
+	 * r10 = (r10 & ~PRESENT) | r11;
+	 */
+	rlwinm	r11, r10, 32-7, _PAGE_PRESENT
+	and	r11, r11, r10
+	rlwimi	r10, r11, 0, _PAGE_PRESENT
+	/* The Linux PTE won't go exactly into the MMU TLB.
+	 * Software indicator bits 24, 25, 26, and 27 must be
+	 * set.  All other Linux PTE bits control the behavior
+	 * of the MMU.
+	 */
+	li	r11, RPN_PATTERN
+	rlwimi	r10, r11, 0, 24, 27	/* Set 24-27 */
+	mtspr	SPRN_MD_RPN, r10	/* Update TLB entry */
+
+	/* Restore registers */
+	mtspr	SPRN_DAR, r11	/* Tag DAR */
+_ENTRY(dtlb_miss_exit_1)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+	rfi
+#ifdef CONFIG_PERF_EVENTS
+_ENTRY(dtlb_miss_perf)
+	lis	r10, (dtlb_miss_counter - PAGE_OFFSET)@ha
+	lwz	r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
+	addi	r11, r11, 1
+	stw	r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
+#endif
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+	rfi
+
+#ifdef CONFIG_HUGETLB_PAGE
+10:	/* 8M pages */
+	/* Extract level 2 index */
+#ifdef CONFIG_PPC_16K_PAGES
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT_8M - PAGE_SHIFT), 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1), 29
+	/* Add level 2 base */
+	rlwimi	r10, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
+#else
+	/* Level 2 base */
+	rlwinm	r10, r11, 0, ~HUGEPD_SHIFT_MASK
+#endif
+	lwz	r10, 0(r10)	/* Get the pte */
+	b	4b
+
+20:	/* 512k pages */
+	/* Extract level 2 index */
+	rlwinm	r10, r10, 32 - (PAGE_SHIFT_512K - PAGE_SHIFT), 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1), 29
+	/* Add level 2 base */
+	rlwimi	r10, r11, 0, 0, 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1) - 1
+	lwz	r10, 0(r10)	/* Get the pte */
+	b	4b
+#endif
+
+/* This is an instruction TLB error on the MPC8xx.  This could be due
+ * to many reasons, such as executing guarded memory or illegal instruction
+ * addresses.  There is nothing to do but handle a big time error fault.
+ */
+	. = 0x1300
+InstructionTLBError:
+	EXCEPTION_PROLOG
+	mr	r4,r12
+	andis.	r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
+	andis.	r10,r9,SRR1_ISI_NOPT@h
+	beq+	1f
+	tlbie	r4
+itlbie:
+	/* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
+1:	EXC_XFER_LITE(0x400, handle_page_fault)
+
+/* This is the data TLB error on the MPC8xx.  This could be due to
+ * many reasons, including a dirty update to a pte.  We bail out to
+ * a higher level function that can handle it.
+ */
+	. = 0x1400
+DataTLBError:
+	mtspr	SPRN_SPRG_SCRATCH0, r10
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+	mfcr	r10
+
+	mfspr	r11, SPRN_DAR
+	cmpwi	cr0, r11, RPN_PATTERN
+	beq-	FixupDAR	/* must be a buggy dcbX, icbi insn. */
+DARFixed:/* Return from dcbx instruction bug workaround */
+	EXCEPTION_PROLOG_1
+	EXCEPTION_PROLOG_2
+	mfspr	r5,SPRN_DSISR
+	stw	r5,_DSISR(r11)
+	mfspr	r4,SPRN_DAR
+	andis.	r10,r5,DSISR_NOHPTE@h
+	beq+	1f
+	tlbie	r4
+dtlbie:
+1:	li	r10,RPN_PATTERN
+	mtspr	SPRN_DAR,r10	/* Tag DAR, to be used in DTLB Error */
+	/* 0x300 is DataAccess exception, needed by bad_page_fault() */
+	EXC_XFER_LITE(0x300, handle_page_fault)
+
+	EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE)
+
+/* On the MPC8xx, these next four traps are used for development
+ * support of breakpoints and such.  Someday I will get around to
+ * using them.
+ */
+	. = 0x1c00
+DataBreakpoint:
+	mtspr	SPRN_SPRG_SCRATCH0, r10
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+	mfcr	r10
+	mfspr	r11, SPRN_SRR0
+	cmplwi	cr0, r11, (dtlbie - PAGE_OFFSET)@l
+	cmplwi	cr7, r11, (itlbie - PAGE_OFFSET)@l
+	beq-	cr0, 11f
+	beq-	cr7, 11f
+	EXCEPTION_PROLOG_1
+	EXCEPTION_PROLOG_2
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	mfspr	r4,SPRN_BAR
+	stw	r4,_DAR(r11)
+	mfspr	r5,SPRN_DSISR
+	EXC_XFER_EE(0x1c00, do_break)
+11:
+	mtcr	r10
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	rfi
+
+#ifdef CONFIG_PERF_EVENTS
+	. = 0x1d00
+InstructionBreakpoint:
+	mtspr	SPRN_SPRG_SCRATCH0, r10
+	mtspr	SPRN_SPRG_SCRATCH1, r11
+	lis	r10, (instruction_counter - PAGE_OFFSET)@ha
+	lwz	r11, (instruction_counter - PAGE_OFFSET)@l(r10)
+	addi	r11, r11, -1
+	stw	r11, (instruction_counter - PAGE_OFFSET)@l(r10)
+	lis	r10, 0xffff
+	ori	r10, r10, 0x01
+	mtspr	SPRN_COUNTA, r10
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	rfi
+#else
+	EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE)
+#endif
+	EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE)
+
+	. = 0x2000
+
+/*
+ * Bottom part of DataStoreTLBMiss handlers for IMMR area and linear RAM.
+ * not enough space in the DataStoreTLBMiss area.
+ */
+DTLBMissIMMR:
+	mtcr	r12
+	/* Set 512k byte guarded page and mark it valid */
+	li	r10, MD_PS512K | MD_GUARDED | MD_SVALID
+	mtspr	SPRN_MD_TWC, r10
+	mfspr	r10, SPRN_IMMR			/* Get current IMMR */
+	rlwinm	r10, r10, 0, 0xfff80000		/* Get 512 kbytes boundary */
+	ori	r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \
+			  _PAGE_PRESENT | _PAGE_NO_CACHE
+	mtspr	SPRN_MD_RPN, r10	/* Update TLB entry */
+
+	li	r11, RPN_PATTERN
+	mtspr	SPRN_DAR, r11	/* Tag DAR */
+_ENTRY(dtlb_miss_exit_2)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+	rfi
+
+DTLBMissLinear:
+	mtcr	r12
+	/* Set 8M byte page and mark it valid */
+	li	r11, MD_PS8MEG | MD_SVALID
+	mtspr	SPRN_MD_TWC, r11
+	rlwinm	r10, r10, 0, 0x0f800000	/* 8xx supports max 256Mb RAM */
+	ori	r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \
+			  _PAGE_PRESENT
+	mtspr	SPRN_MD_RPN, r10	/* Update TLB entry */
+
+	li	r11, RPN_PATTERN
+	mtspr	SPRN_DAR, r11	/* Tag DAR */
+_ENTRY(dtlb_miss_exit_3)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+	rfi
+
+#ifndef CONFIG_PIN_TLB_TEXT
+ITLBMissLinear:
+	mtcr	r12
+	/* Set 8M byte page and mark it valid */
+	li	r11, MI_PS8MEG | MI_SVALID
+	mtspr	SPRN_MI_TWC, r11
+	rlwinm	r10, r10, 0, 0x0f800000	/* 8xx supports max 256Mb RAM */
+	ori	r10, r10, 0xf0 | MI_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \
+			  _PAGE_PRESENT
+	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
+
+_ENTRY(itlb_miss_exit_2)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r12, SPRN_SPRG_SCRATCH2
+	rfi
+#endif
+
+/* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions
+ * by decoding the registers used by the dcbx instruction and adding them.
+ * DAR is set to the calculated address.
+ */
+ /* define if you don't want to use self modifying code */
+#define NO_SELF_MODIFYING_CODE
+FixupDAR:/* Entry point for dcbx workaround. */
+	mtspr	SPRN_SPRG_SCRATCH2, r10
+	/* fetch instruction from memory. */
+	mfspr	r10, SPRN_SRR0
+	rlwinm	r11, r10, 16, 0xfff8
+	cmpli	cr0, r11, PAGE_OFFSET@h
+	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
+	blt+	3f
+	rlwinm	r11, r10, 16, 0xfff8
+_ENTRY(FixupDAR_cmp)
+	cmpli	cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+	/* create physical page address from effective address */
+	tophys(r11, r10)
+	blt-	cr7, 201f
+	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
+	/* Insert level 1 index */
+3:	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
+	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
+	mtcr	r11
+	bt	28,200f		/* bit 28 = Large page (8M) */
+	bt	29,202f		/* bit 29 = Large page (8M or 512K) */
+	rlwinm	r11, r11,0,0,19	/* Extract page descriptor page address */
+	/* Insert level 2 index */
+	rlwimi	r11, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
+	lwz	r11, 0(r11)	/* Get the pte */
+	/* concat physical page address(r11) and page offset(r10) */
+	rlwimi	r11, r10, 0, 32 - PAGE_SHIFT, 31
+201:	lwz	r11,0(r11)
+/* Check if it really is a dcbx instruction. */
+/* dcbt and dcbtst does not generate DTLB Misses/Errors,
+ * no need to include them here */
+	xoris	r10, r11, 0x7c00	/* check if major OP code is 31 */
+	rlwinm	r10, r10, 0, 21, 5
+	cmpwi	cr0, r10, 2028	/* Is dcbz? */
+	beq+	142f
+	cmpwi	cr0, r10, 940	/* Is dcbi? */
+	beq+	142f
+	cmpwi	cr0, r10, 108	/* Is dcbst? */
+	beq+	144f		/* Fix up store bit! */
+	cmpwi	cr0, r10, 172	/* Is dcbf? */
+	beq+	142f
+	cmpwi	cr0, r10, 1964	/* Is icbi? */
+	beq+	142f
+141:	mfspr	r10,SPRN_SPRG_SCRATCH2
+	b	DARFixed	/* Nope, go back to normal TLB processing */
+
+	/* concat physical page address(r11) and page offset(r10) */
+200:
+#ifdef CONFIG_PPC_16K_PAGES
+	rlwinm	r11, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
+	rlwimi	r11, r10, 32 - (PAGE_SHIFT_8M - 2), 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1), 29
+#else
+	rlwinm	r11, r10, 0, ~HUGEPD_SHIFT_MASK
+#endif
+	lwz	r11, 0(r11)	/* Get the pte */
+	/* concat physical page address(r11) and page offset(r10) */
+	rlwimi	r11, r10, 0, 32 - PAGE_SHIFT_8M, 31
+	b	201b
+
+202:
+	rlwinm	r11, r11, 0, 0, 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1) - 1
+	rlwimi	r11, r10, 32 - (PAGE_SHIFT_512K - 2), 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1), 29
+	lwz	r11, 0(r11)	/* Get the pte */
+	/* concat physical page address(r11) and page offset(r10) */
+	rlwimi	r11, r10, 0, 32 - PAGE_SHIFT_512K, 31
+	b	201b
+
+144:	mfspr	r10, SPRN_DSISR
+	rlwinm	r10, r10,0,7,5	/* Clear store bit for buggy dcbst insn */
+	mtspr	SPRN_DSISR, r10
+142:	/* continue, it was a dcbx, dcbi instruction. */
+#ifndef NO_SELF_MODIFYING_CODE
+	andis.	r10,r11,0x1f	/* test if reg RA is r0 */
+	li	r10,modified_instr@l
+	dcbtst	r0,r10		/* touch for store */
+	rlwinm	r11,r11,0,0,20	/* Zero lower 10 bits */
+	oris	r11,r11,640	/* Transform instr. to a "add r10,RA,RB" */
+	ori	r11,r11,532
+	stw	r11,0(r10)	/* store add/and instruction */
+	dcbf	0,r10		/* flush new instr. to memory. */
+	icbi	0,r10		/* invalidate instr. cache line */
+	mfspr	r11, SPRN_SPRG_SCRATCH1	/* restore r11 */
+	mfspr	r10, SPRN_SPRG_SCRATCH0	/* restore r10 */
+	isync			/* Wait until new instr is loaded from memory */
+modified_instr:
+	.space	4		/* this is where the add instr. is stored */
+	bne+	143f
+	subf	r10,r0,r10	/* r10=r10-r0, only if reg RA is r0 */
+143:	mtdar	r10		/* store faulting EA in DAR */
+	mfspr	r10,SPRN_SPRG_SCRATCH2
+	b	DARFixed	/* Go back to normal TLB handling */
+#else
+	mfctr	r10
+	mtdar	r10			/* save ctr reg in DAR */
+	rlwinm	r10, r11, 24, 24, 28	/* offset into jump table for reg RB */
+	addi	r10, r10, 150f@l	/* add start of table */
+	mtctr	r10			/* load ctr with jump address */
+	xor	r10, r10, r10		/* sum starts at zero */
+	bctr				/* jump into table */
+150:
+	add	r10, r10, r0	;b	151f
+	add	r10, r10, r1	;b	151f
+	add	r10, r10, r2	;b	151f
+	add	r10, r10, r3	;b	151f
+	add	r10, r10, r4	;b	151f
+	add	r10, r10, r5	;b	151f
+	add	r10, r10, r6	;b	151f
+	add	r10, r10, r7	;b	151f
+	add	r10, r10, r8	;b	151f
+	add	r10, r10, r9	;b	151f
+	mtctr	r11	;b	154f	/* r10 needs special handling */
+	mtctr	r11	;b	153f	/* r11 needs special handling */
+	add	r10, r10, r12	;b	151f
+	add	r10, r10, r13	;b	151f
+	add	r10, r10, r14	;b	151f
+	add	r10, r10, r15	;b	151f
+	add	r10, r10, r16	;b	151f
+	add	r10, r10, r17	;b	151f
+	add	r10, r10, r18	;b	151f
+	add	r10, r10, r19	;b	151f
+	add	r10, r10, r20	;b	151f
+	add	r10, r10, r21	;b	151f
+	add	r10, r10, r22	;b	151f
+	add	r10, r10, r23	;b	151f
+	add	r10, r10, r24	;b	151f
+	add	r10, r10, r25	;b	151f
+	add	r10, r10, r26	;b	151f
+	add	r10, r10, r27	;b	151f
+	add	r10, r10, r28	;b	151f
+	add	r10, r10, r29	;b	151f
+	add	r10, r10, r30	;b	151f
+	add	r10, r10, r31
+151:
+	rlwinm. r11,r11,19,24,28	/* offset into jump table for reg RA */
+	beq	152f			/* if reg RA is zero, don't add it */
+	addi	r11, r11, 150b@l	/* add start of table */
+	mtctr	r11			/* load ctr with jump address */
+	rlwinm	r11,r11,0,16,10		/* make sure we don't execute this more than once */
+	bctr				/* jump into table */
+152:
+	mfdar	r11
+	mtctr	r11			/* restore ctr reg from DAR */
+	mtdar	r10			/* save fault EA to DAR */
+	mfspr	r10,SPRN_SPRG_SCRATCH2
+	b	DARFixed		/* Go back to normal TLB handling */
+
+	/* special handling for r10,r11 since these are modified already */
+153:	mfspr	r11, SPRN_SPRG_SCRATCH1	/* load r11 from SPRN_SPRG_SCRATCH1 */
+	add	r10, r10, r11	/* add it */
+	mfctr	r11		/* restore r11 */
+	b	151b
+154:	mfspr	r11, SPRN_SPRG_SCRATCH0	/* load r10 from SPRN_SPRG_SCRATCH0 */
+	add	r10, r10, r11	/* add it */
+	mfctr	r11		/* restore r11 */
+	b	151b
+#endif
+
+/*
+ * This is where the main kernel code starts.
+ */
+start_here:
+	/* ptr to current */
+	lis	r2,init_task@h
+	ori	r2,r2,init_task@l
+
+	/* ptr to phys current thread */
+	tophys(r4,r2)
+	addi	r4,r4,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG_THREAD,r4
+
+	/* stack */
+	lis	r1,init_thread_union@ha
+	addi	r1,r1,init_thread_union@l
+	li	r0,0
+	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
+
+	lis	r6, swapper_pg_dir@ha
+	tophys(r6,r6)
+	mtspr	SPRN_M_TW, r6
+
+	bl	early_init	/* We have to do this with MMU on */
+
+/*
+ * Decide what sort of machine this is and initialize the MMU.
+ */
+	li	r3,0
+	mr	r4,r31
+	bl	machine_init
+	bl	MMU_init
+
+/*
+ * Go back to running unmapped so we can load up new values
+ * and change to using our exception vectors.
+ * On the 8xx, all we have to do is invalidate the TLB to clear
+ * the old 8M byte TLB mappings and load the page table base register.
+ */
+	/* The right way to do this would be to track it down through
+	 * init's THREAD like the context switch code does, but this is
+	 * easier......until someone changes init's static structures.
+	 */
+	lis	r4,2f@h
+	ori	r4,r4,2f@l
+	tophys(r4,r4)
+	li	r3,MSR_KERNEL & ~(MSR_IR|MSR_DR)
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	rfi
+/* Load up the kernel context */
+2:
+	tlbia			/* Clear all TLB entries */
+	sync			/* wait for tlbia/tlbie to finish */
+
+	/* set up the PTE pointers for the Abatron bdiGDB.
+	*/
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r5, 0xf0(0)	/* Must match your Abatron config file */
+	tophys(r5,r5)
+	lis	r6, swapper_pg_dir@h
+	ori	r6, r6, swapper_pg_dir@l
+	stw	r6, 0(r5)
+
+/* Now turn on the MMU for real! */
+	li	r4,MSR_KERNEL
+	lis	r3,start_kernel@h
+	ori	r3,r3,start_kernel@l
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	rfi			/* enable MMU and jump to start_kernel */
+
+/* Set up the initial MMU state so we can do the first level of
+ * kernel initialization.  This maps the first 8 MBytes of memory 1:1
+ * virtual to physical.  Also, set the cache mode since that is defined
+ * by TLB entries and perform any additional mapping (like of the IMMR).
+ * If configured to pin some TLBs, we pin the first 8 Mbytes of kernel,
+ * 24 Mbytes of data, and the 512k IMMR space.  Anything not covered by
+ * these mappings is mapped by page tables.
+ */
+initial_mmu:
+	li	r8, 0
+	mtspr	SPRN_MI_CTR, r8		/* remove PINNED ITLB entries */
+	lis	r10, MD_RESETVAL@h
+#ifndef CONFIG_8xx_COPYBACK
+	oris	r10, r10, MD_WTDEF@h
+#endif
+	mtspr	SPRN_MD_CTR, r10	/* remove PINNED DTLB entries */
+
+	tlbia			/* Invalidate all TLB entries */
+#ifdef CONFIG_PIN_TLB_TEXT
+	lis	r8, MI_RSV4I@h
+	ori	r8, r8, 0x1c00
+
+	mtspr	SPRN_MI_CTR, r8	/* Set instruction MMU control */
+#endif
+
+#ifdef CONFIG_PIN_TLB_DATA
+	oris	r10, r10, MD_RSV4I@h
+	mtspr	SPRN_MD_CTR, r10	/* Set data TLB control */
+#endif
+
+	/* Now map the lower 8 Meg into the ITLB. */
+	lis	r8, KERNELBASE@h	/* Create vaddr for TLB */
+	ori	r8, r8, MI_EVALID	/* Mark it valid */
+	mtspr	SPRN_MI_EPN, r8
+	li	r8, MI_PS8MEG /* Set 8M byte page */
+	ori	r8, r8, MI_SVALID	/* Make it valid */
+	mtspr	SPRN_MI_TWC, r8
+	li	r8, MI_BOOTINIT		/* Create RPN for address 0 */
+	mtspr	SPRN_MI_RPN, r8		/* Store TLB entry */
+
+	lis	r8, MI_APG_INIT@h	/* Set protection modes */
+	ori	r8, r8, MI_APG_INIT@l
+	mtspr	SPRN_MI_AP, r8
+	lis	r8, MD_APG_INIT@h
+	ori	r8, r8, MD_APG_INIT@l
+	mtspr	SPRN_MD_AP, r8
+
+	/* Map a 512k page for the IMMR to get the processor
+	 * internal registers (among other things).
+	 */
+#ifdef CONFIG_PIN_TLB_IMMR
+	oris	r10, r10, MD_RSV4I@h
+	ori	r10, r10, 0x1c00
+	mtspr	SPRN_MD_CTR, r10
+
+	mfspr	r9, 638			/* Get current IMMR */
+	andis.	r9, r9, 0xfff8		/* Get 512 kbytes boundary */
+
+	lis	r8, VIRT_IMMR_BASE@h	/* Create vaddr for TLB */
+	ori	r8, r8, MD_EVALID	/* Mark it valid */
+	mtspr	SPRN_MD_EPN, r8
+	li	r8, MD_PS512K | MD_GUARDED	/* Set 512k byte page */
+	ori	r8, r8, MD_SVALID	/* Make it valid */
+	mtspr	SPRN_MD_TWC, r8
+	mr	r8, r9			/* Create paddr for TLB */
+	ori	r8, r8, MI_BOOTINIT|0x2 /* Inhibit cache -- Cort */
+	mtspr	SPRN_MD_RPN, r8
+#endif
+
+	/* Since the cache is enabled according to the information we
+	 * just loaded into the TLB, invalidate and enable the caches here.
+	 * We should probably check/set other modes....later.
+	 */
+	lis	r8, IDC_INVALL@h
+	mtspr	SPRN_IC_CST, r8
+	mtspr	SPRN_DC_CST, r8
+	lis	r8, IDC_ENABLE@h
+	mtspr	SPRN_IC_CST, r8
+#ifdef CONFIG_8xx_COPYBACK
+	mtspr	SPRN_DC_CST, r8
+#else
+	/* For a debug option, I left this here to easily enable
+	 * the write through cache mode
+	 */
+	lis	r8, DC_SFWT@h
+	mtspr	SPRN_DC_CST, r8
+	lis	r8, IDC_ENABLE@h
+	mtspr	SPRN_DC_CST, r8
+#endif
+	/* Disable debug mode entry on breakpoints */
+	mfspr	r8, SPRN_DER
+#ifdef CONFIG_PERF_EVENTS
+	rlwinm	r8, r8, 0, ~0xc
+#else
+	rlwinm	r8, r8, 0, ~0x8
+#endif
+	mtspr	SPRN_DER, r8
+	blr
+
+
+/*
+ * We put a few things here that have to be page-aligned.
+ * This stuff goes at the beginning of the data segment,
+ * which is page-aligned.
+ */
+	.data
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+	.align	PAGE_SHIFT
+empty_zero_page:
+	.space	PAGE_SIZE
+EXPORT_SYMBOL(empty_zero_page)
+
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+/* Room for two PTE table poiners, usually the kernel and current user
+ * pointer to their respective root page table (pgdir).
+ */
+abatron_pteptrs:
+	.space	8
+
+#ifdef CONFIG_PERF_EVENTS
+	.globl	itlb_miss_counter
+itlb_miss_counter:
+	.space	4
+
+	.globl	dtlb_miss_counter
+dtlb_miss_counter:
+	.space	4
+
+	.globl	instruction_counter
+instruction_counter:
+	.space	4
+#endif
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
new file mode 100644
index 000000000..306e26c07
--- /dev/null
+++ b/arch/powerpc/kernel/head_booke.h
@@ -0,0 +1,458 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __HEAD_BOOKE_H__
+#define __HEAD_BOOKE_H__
+
+#include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+
+/*
+ * Macros used for common Book-e exception handling
+ */
+
+#define SET_IVOR(vector_number, vector_label)		\
+		li	r26,vector_label@l; 		\
+		mtspr	SPRN_IVOR##vector_number,r26;	\
+		sync
+
+#if (THREAD_SHIFT < 15)
+#define ALLOC_STACK_FRAME(reg, val)			\
+	addi reg,reg,val
+#else
+#define ALLOC_STACK_FRAME(reg, val)			\
+	addis	reg,reg,val@ha;				\
+	addi	reg,reg,val@l
+#endif
+
+/*
+ * Macro used to get to thread save registers.
+ * Note that entries 0-3 are used for the prolog code, and the remaining
+ * entries are available for specific exception use in the event a handler
+ * requires more than 4 scratch registers.
+ */
+#define THREAD_NORMSAVE(offset)	(THREAD_NORMSAVES + (offset * 4))
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define BOOKE_CLEAR_BTB(reg)									\
+START_BTB_FLUSH_SECTION								\
+	BTB_FLUSH(reg)									\
+END_BTB_FLUSH_SECTION
+#else
+#define BOOKE_CLEAR_BTB(reg)
+#endif
+
+
+#define NORMAL_EXCEPTION_PROLOG(intno)						     \
+	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
+	mfspr	r10, SPRN_SPRG_THREAD;					     \
+	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
+	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
+	mfcr	r13;			/* save CR in r13 for now	   */\
+	mfspr	r11, SPRN_SRR1;		                                     \
+	DO_KVM	BOOKE_INTERRUPT_##intno SPRN_SRR1;			     \
+	andi.	r11, r11, MSR_PR;	/* check whether user or kernel    */\
+	mr	r11, r1;						     \
+	beq	1f;							     \
+	BOOKE_CLEAR_BTB(r11)						\
+	/* if from user, start at top of this thread's kernel stack */       \
+	lwz	r11, THREAD_INFO-THREAD(r10);				     \
+	ALLOC_STACK_FRAME(r11, THREAD_SIZE);				     \
+1 :	subi	r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */     \
+	stw	r13, _CCR(r11);		/* save various registers */	     \
+	stw	r12,GPR12(r11);						     \
+	stw	r9,GPR9(r11);						     \
+	mfspr	r13, SPRN_SPRG_RSCRATCH0;				     \
+	stw	r13, GPR10(r11);					     \
+	lwz	r12, THREAD_NORMSAVE(0)(r10);				     \
+	stw	r12,GPR11(r11);						     \
+	lwz	r13, THREAD_NORMSAVE(2)(r10); /* restore r13 */		     \
+	mflr	r10;							     \
+	stw	r10,_LINK(r11);						     \
+	mfspr	r12,SPRN_SRR0;						     \
+	stw	r1, GPR1(r11);						     \
+	mfspr	r9,SPRN_SRR1;						     \
+	stw	r1, 0(r11);						     \
+	mr	r1, r11;						     \
+	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)	   */\
+	stw	r0,GPR0(r11);						     \
+	lis	r10, STACK_FRAME_REGS_MARKER@ha;/* exception frame marker */ \
+	addi	r10, r10, STACK_FRAME_REGS_MARKER@l;			     \
+	stw	r10, 8(r11);						     \
+	SAVE_4GPRS(3, r11);						     \
+	SAVE_2GPRS(7, r11)
+
+/* To handle the additional exception priority levels on 40x and Book-E
+ * processors we allocate a stack per additional priority level.
+ *
+ * On 40x critical is the only additional level
+ * On 44x/e500 we have critical and machine check
+ * On e200 we have critical and debug (machine check occurs via critical)
+ *
+ * Additionally we reserve a SPRG for each priority level so we can free up a
+ * GPR to use as the base for indirect access to the exception stacks.  This
+ * is necessary since the MMU is always on, for Book-E parts, and the stacks
+ * are offset from KERNELBASE.
+ *
+ * There is some space optimization to be had here if desired.  However
+ * to allow for a common kernel with support for debug exceptions either
+ * going to critical or their own debug level we aren't currently
+ * providing configurations that micro-optimize space usage.
+ */
+
+#define MC_STACK_BASE		mcheckirq_ctx
+#define CRIT_STACK_BASE		critirq_ctx
+
+/* only on e500mc/e200 */
+#define DBG_STACK_BASE		dbgirq_ctx
+
+#define EXC_LVL_FRAME_OVERHEAD	(THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
+
+#ifdef CONFIG_SMP
+#define BOOKE_LOAD_EXC_LEVEL_STACK(level)		\
+	mfspr	r8,SPRN_PIR;				\
+	slwi	r8,r8,2;				\
+	addis	r8,r8,level##_STACK_BASE@ha;		\
+	lwz	r8,level##_STACK_BASE@l(r8);		\
+	addi	r8,r8,EXC_LVL_FRAME_OVERHEAD;
+#else
+#define BOOKE_LOAD_EXC_LEVEL_STACK(level)		\
+	lis	r8,level##_STACK_BASE@ha;		\
+	lwz	r8,level##_STACK_BASE@l(r8);		\
+	addi	r8,r8,EXC_LVL_FRAME_OVERHEAD;
+#endif
+
+/*
+ * Exception prolog for critical/machine check exceptions.  This is a
+ * little different from the normal exception prolog above since a
+ * critical/machine check exception can potentially occur at any point
+ * during normal exception processing. Thus we cannot use the same SPRG
+ * registers as the normal prolog above. Instead we use a portion of the
+ * critical/machine check exception stack at low physical addresses.
+ */
+#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \
+	mtspr	SPRN_SPRG_WSCRATCH_##exc_level,r8;			     \
+	BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \
+	stw	r9,GPR9(r8);		/* save various registers	   */\
+	mfcr	r9;			/* save CR in r9 for now	   */\
+	stw	r10,GPR10(r8);						     \
+	stw	r11,GPR11(r8);						     \
+	stw	r9,_CCR(r8);		/* save CR on stack		   */\
+	mfspr	r11,exc_level_srr1;	/* check whether user or kernel    */\
+	DO_KVM	BOOKE_INTERRUPT_##intno exc_level_srr1;		             \
+	BOOKE_CLEAR_BTB(r10)						\
+	andi.	r11,r11,MSR_PR;						     \
+	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
+	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+	addi	r11,r11,EXC_LVL_FRAME_OVERHEAD;	/* allocate stack frame    */\
+	beq	1f;							     \
+	/* COMING FROM USER MODE */					     \
+	stw	r9,_CCR(r11);		/* save CR			   */\
+	lwz	r10,GPR10(r8);		/* copy regs from exception stack  */\
+	lwz	r9,GPR9(r8);						     \
+	stw	r10,GPR10(r11);						     \
+	lwz	r10,GPR11(r8);						     \
+	stw	r9,GPR9(r11);						     \
+	stw	r10,GPR11(r11);						     \
+	b	2f;							     \
+	/* COMING FROM PRIV MODE */					     \
+1:	lwz	r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11);		     \
+	lwz	r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11);		     \
+	stw	r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8);			     \
+	stw	r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8);		     \
+	lwz	r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11);			     \
+	stw	r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8);			     \
+	mr	r11,r8;							     \
+2:	mfspr	r8,SPRN_SPRG_RSCRATCH_##exc_level;			     \
+	stw	r12,GPR12(r11);		/* save various registers	   */\
+	mflr	r10;							     \
+	stw	r10,_LINK(r11);						     \
+	mfspr	r12,SPRN_DEAR;		/* save DEAR and ESR in the frame  */\
+	stw	r12,_DEAR(r11);		/* since they may have had stuff   */\
+	mfspr	r9,SPRN_ESR;		/* in them at the point where the  */\
+	stw	r9,_ESR(r11);		/* exception was taken		   */\
+	mfspr	r12,exc_level_srr0;					     \
+	stw	r1,GPR1(r11);						     \
+	mfspr	r9,exc_level_srr1;					     \
+	stw	r1,0(r11);						     \
+	mr	r1,r11;							     \
+	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)	   */\
+	stw	r0,GPR0(r11);						     \
+	SAVE_4GPRS(3, r11);						     \
+	SAVE_2GPRS(7, r11)
+
+#define CRITICAL_EXCEPTION_PROLOG(intno) \
+		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1)
+#define DEBUG_EXCEPTION_PROLOG \
+		EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1)
+#define MCHECK_EXCEPTION_PROLOG \
+		EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \
+			SPRN_MCSRR0, SPRN_MCSRR1)
+
+/*
+ * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite
+ * being delivered to the host.  This exception can only happen
+ * inside a KVM guest -- so we just handle up to the DO_KVM rather
+ * than try to fit this into one of the existing prolog macros.
+ */
+#define GUEST_DOORBELL_EXCEPTION \
+	START_EXCEPTION(GuestDoorbell);					     \
+	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
+	mfspr	r10, SPRN_SPRG_THREAD;					     \
+	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
+	mfcr	r13;			/* save CR in r13 for now	   */\
+	DO_KVM	BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1;			     \
+	trap
+
+/*
+ * Exception vectors.
+ */
+#define	START_EXCEPTION(label)						     \
+        .align 5;              						     \
+label:
+
+#define EXCEPTION(n, intno, label, hdlr, xfer)			\
+	START_EXCEPTION(label);					\
+	NORMAL_EXCEPTION_PROLOG(intno);				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	xfer(n, hdlr)
+
+#define CRITICAL_EXCEPTION(n, intno, label, hdlr)			\
+	START_EXCEPTION(label);						\
+	CRITICAL_EXCEPTION_PROLOG(intno);				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
+	EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
+			  NOCOPY, crit_transfer_to_handler, \
+			  ret_from_crit_exc)
+
+#define MCHECK_EXCEPTION(n, label, hdlr)			\
+	START_EXCEPTION(label);					\
+	MCHECK_EXCEPTION_PROLOG;				\
+	mfspr	r5,SPRN_ESR;					\
+	stw	r5,_ESR(r11);					\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
+			  NOCOPY, mcheck_transfer_to_handler,   \
+			  ret_from_mcheck_exc)
+
+#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)	\
+	li	r10,trap;					\
+	stw	r10,_TRAP(r11);					\
+	lis	r10,msr@h;					\
+	ori	r10,r10,msr@l;					\
+	copyee(r10, r9);					\
+	bl	tfer;		 				\
+	.long	hdlr;						\
+	.long	ret
+
+#define COPY_EE(d, s)		rlwimi d,s,0,16,16
+#define NOCOPY(d, s)
+
+#define EXC_XFER_STD(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_LITE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
+			  ret_from_except)
+
+#define EXC_XFER_EE(n, hdlr)		\
+	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \
+			  ret_from_except_full)
+
+#define EXC_XFER_EE_LITE(n, hdlr)	\
+	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \
+			  ret_from_except)
+
+/* Check for a single step debug exception while in an exception
+ * handler before state has been saved.  This is to catch the case
+ * where an instruction that we are trying to single step causes
+ * an exception (eg ITLB/DTLB miss) and thus the first instruction of
+ * the exception handler generates a single step debug exception.
+ *
+ * If we get a debug trap on the first instruction of an exception handler,
+ * we reset the MSR_DE in the _exception handler's_ MSR (the debug trap is
+ * a critical exception, so we are using SPRN_CSRR1 to manipulate the MSR).
+ * The exception handler was handling a non-critical interrupt, so it will
+ * save (and later restore) the MSR via SPRN_CSRR1, which will still have
+ * the MSR_DE bit set.
+ */
+#define DEBUG_DEBUG_EXCEPTION						      \
+	START_EXCEPTION(DebugDebug);					      \
+	DEBUG_EXCEPTION_PROLOG;						      \
+									      \
+	/*								      \
+	 * If there is a single step or branch-taken exception in an	      \
+	 * exception entry sequence, it was probably meant to apply to	      \
+	 * the code where the exception occurred (since exception entry	      \
+	 * doesn't turn off DE automatically).  We simulate the effect	      \
+	 * of turning off DE on entry to an exception handler by turning      \
+	 * off DE in the DSRR1 value and clearing the debug status.	      \
+	 */								      \
+	mfspr	r10,SPRN_DBSR;		/* check single-step/branch taken */  \
+	andis.	r10,r10,(DBSR_IC|DBSR_BT)@h;				      \
+	beq+	2f;							      \
+									      \
+	lis	r10,interrupt_base@h;	/* check if exception in vectors */   \
+	ori	r10,r10,interrupt_base@l;				      \
+	cmplw	r12,r10;						      \
+	blt+	2f;			/* addr below exception vectors */    \
+									      \
+	lis	r10,interrupt_end@h;					      \
+	ori	r10,r10,interrupt_end@l;				      \
+	cmplw	r12,r10;						      \
+	bgt+	2f;			/* addr above exception vectors */    \
+									      \
+	/* here it looks like we got an inappropriate debug exception. */     \
+1:	rlwinm	r9,r9,0,~MSR_DE;	/* clear DE in the CDRR1 value */     \
+	lis	r10,(DBSR_IC|DBSR_BT)@h;	/* clear the IC event */      \
+	mtspr	SPRN_DBSR,r10;						      \
+	/* restore state and get out */					      \
+	lwz	r10,_CCR(r11);						      \
+	lwz	r0,GPR0(r11);						      \
+	lwz	r1,GPR1(r11);						      \
+	mtcrf	0x80,r10;						      \
+	mtspr	SPRN_DSRR0,r12;						      \
+	mtspr	SPRN_DSRR1,r9;						      \
+	lwz	r9,GPR9(r11);						      \
+	lwz	r12,GPR12(r11);						      \
+	mtspr	SPRN_SPRG_WSCRATCH_DBG,r8;				      \
+	BOOKE_LOAD_EXC_LEVEL_STACK(DBG); /* r8 points to the debug stack */ \
+	lwz	r10,GPR10(r8);						      \
+	lwz	r11,GPR11(r8);						      \
+	mfspr	r8,SPRN_SPRG_RSCRATCH_DBG;				      \
+									      \
+	PPC_RFDI;							      \
+	b	.;							      \
+									      \
+	/* continue normal handling for a debug exception... */		      \
+2:	mfspr	r4,SPRN_DBSR;						      \
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, debug_transfer_to_handler, ret_from_debug_exc)
+
+#define DEBUG_CRIT_EXCEPTION						      \
+	START_EXCEPTION(DebugCrit);					      \
+	CRITICAL_EXCEPTION_PROLOG(DEBUG);				      \
+									      \
+	/*								      \
+	 * If there is a single step or branch-taken exception in an	      \
+	 * exception entry sequence, it was probably meant to apply to	      \
+	 * the code where the exception occurred (since exception entry	      \
+	 * doesn't turn off DE automatically).  We simulate the effect	      \
+	 * of turning off DE on entry to an exception handler by turning      \
+	 * off DE in the CSRR1 value and clearing the debug status.	      \
+	 */								      \
+	mfspr	r10,SPRN_DBSR;		/* check single-step/branch taken */  \
+	andis.	r10,r10,(DBSR_IC|DBSR_BT)@h;				      \
+	beq+	2f;							      \
+									      \
+	lis	r10,interrupt_base@h;	/* check if exception in vectors */   \
+	ori	r10,r10,interrupt_base@l;				      \
+	cmplw	r12,r10;						      \
+	blt+	2f;			/* addr below exception vectors */    \
+									      \
+	lis	r10,interrupt_end@h;					      \
+	ori	r10,r10,interrupt_end@l;				      \
+	cmplw	r12,r10;						      \
+	bgt+	2f;			/* addr above exception vectors */    \
+									      \
+	/* here it looks like we got an inappropriate debug exception. */     \
+1:	rlwinm	r9,r9,0,~MSR_DE;	/* clear DE in the CSRR1 value */     \
+	lis	r10,(DBSR_IC|DBSR_BT)@h;	/* clear the IC event */      \
+	mtspr	SPRN_DBSR,r10;						      \
+	/* restore state and get out */					      \
+	lwz	r10,_CCR(r11);						      \
+	lwz	r0,GPR0(r11);						      \
+	lwz	r1,GPR1(r11);						      \
+	mtcrf	0x80,r10;						      \
+	mtspr	SPRN_CSRR0,r12;						      \
+	mtspr	SPRN_CSRR1,r9;						      \
+	lwz	r9,GPR9(r11);						      \
+	lwz	r12,GPR12(r11);						      \
+	mtspr	SPRN_SPRG_WSCRATCH_CRIT,r8;				      \
+	BOOKE_LOAD_EXC_LEVEL_STACK(CRIT); /* r8 points to the debug stack */  \
+	lwz	r10,GPR10(r8);						      \
+	lwz	r11,GPR11(r8);						      \
+	mfspr	r8,SPRN_SPRG_RSCRATCH_CRIT;				      \
+									      \
+	rfci;								      \
+	b	.;							      \
+									      \
+	/* continue normal handling for a critical exception... */	      \
+2:	mfspr	r4,SPRN_DBSR;						      \
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, crit_transfer_to_handler, ret_from_crit_exc)
+
+#define DATA_STORAGE_EXCEPTION						      \
+	START_EXCEPTION(DataStorage)					      \
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE);		      \
+	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
+	stw	r5,_ESR(r11);						      \
+	mfspr	r4,SPRN_DEAR;		/* Grab the DEAR */		      \
+	EXC_XFER_LITE(0x0300, handle_page_fault)
+
+#define INSTRUCTION_STORAGE_EXCEPTION					      \
+	START_EXCEPTION(InstructionStorage)				      \
+	NORMAL_EXCEPTION_PROLOG(INST_STORAGE);		      \
+	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
+	stw	r5,_ESR(r11);						      \
+	mr      r4,r12;                 /* Pass SRR0 as arg2 */		      \
+	li      r5,0;                   /* Pass zero as arg3 */		      \
+	EXC_XFER_LITE(0x0400, handle_page_fault)
+
+#define ALIGNMENT_EXCEPTION						      \
+	START_EXCEPTION(Alignment)					      \
+	NORMAL_EXCEPTION_PROLOG(ALIGNMENT);		      \
+	mfspr   r4,SPRN_DEAR;           /* Grab the DEAR and save it */	      \
+	stw     r4,_DEAR(r11);						      \
+	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_EE(0x0600, alignment_exception)
+
+#define PROGRAM_EXCEPTION						      \
+	START_EXCEPTION(Program)					      \
+	NORMAL_EXCEPTION_PROLOG(PROGRAM);		      \
+	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
+	stw	r4,_ESR(r11);						      \
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_STD(0x0700, program_check_exception)
+
+#define DECREMENTER_EXCEPTION						      \
+	START_EXCEPTION(Decrementer)					      \
+	NORMAL_EXCEPTION_PROLOG(DECREMENTER);		      \
+	lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
+	mtspr   SPRN_TSR,r0;		/* Clear the DEC interrupt */	      \
+	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_LITE(0x0900, timer_interrupt)
+
+#define FP_UNAVAILABLE_EXCEPTION					      \
+	START_EXCEPTION(FloatingPointUnavailable)			      \
+	NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL);		      \
+	beq	1f;							      \
+	bl	load_up_fpu;		/* if from user, just load it up */   \
+	b	fast_exception_return;					      \
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
+	EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception)
+
+#ifndef __ASSEMBLY__
+struct exception_regs {
+	unsigned long mas0;
+	unsigned long mas1;
+	unsigned long mas2;
+	unsigned long mas3;
+	unsigned long mas6;
+	unsigned long mas7;
+	unsigned long srr0;
+	unsigned long srr1;
+	unsigned long csrr0;
+	unsigned long csrr1;
+	unsigned long dsrr0;
+	unsigned long dsrr1;
+	unsigned long saved_ksp_limit;
+};
+
+/* ensure this structure is always sized to a multiple of the stack alignment */
+#define STACK_EXC_LVL_FRAME_SIZE	_ALIGN_UP(sizeof (struct exception_regs), 16)
+
+#endif /* __ASSEMBLY__ */
+#endif /* __HEAD_BOOKE_H__ */
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
new file mode 100644
index 000000000..2386ce2a9
--- /dev/null
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -0,0 +1,1253 @@
+/*
+ * Kernel execution entry point code.
+ *
+ *    Copyright (c) 1995-1996 Gary Thomas <gdt@linuxppc.org>
+ *	Initial PowerPC version.
+ *    Copyright (c) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *	Rewritten for PReP
+ *    Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au>
+ *	Low-level exception handers, MMU support, and rewrite.
+ *    Copyright (c) 1997 Dan Malek <dmalek@jlc.net>
+ *	PowerPC 8xx modifications.
+ *    Copyright (c) 1998-1999 TiVo, Inc.
+ *	PowerPC 403GCX modifications.
+ *    Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu>
+ *	PowerPC 403GCX/405GP modifications.
+ *    Copyright 2000 MontaVista Software Inc.
+ *	PPC405 modifications
+ *	PowerPC 403GCX/405GP modifications.
+ *	Author: MontaVista Software, Inc.
+ *		frank_rowand@mvista.com or source@mvista.com
+ *		debbie_chu@mvista.com
+ *    Copyright 2002-2004 MontaVista Software, Inc.
+ *	PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org>
+ *    Copyright 2004 Freescale Semiconductor, Inc
+ *	PowerPC e500 modifications, Kumar Gala <galak@kernel.crashing.org>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+#include "head_booke.h"
+
+/* As with the other PowerPC ports, it is expected that when code
+ * execution begins here, the following registers contain valid, yet
+ * optional, information:
+ *
+ *   r3 - Board info structure pointer (DRAM, frequency, MAC address, etc.)
+ *   r4 - Starting address of the init RAM disk
+ *   r5 - Ending address of the init RAM disk
+ *   r6 - Start of kernel command line string (e.g. "mem=128")
+ *   r7 - End of kernel command line string
+ *
+ */
+	__HEAD
+_ENTRY(_stext);
+_ENTRY(_start);
+	/*
+	 * Reserve a word at a fixed location to store the address
+	 * of abatron_pteptrs
+	 */
+	nop
+
+	/* Translate device tree address to physical, save in r30/r31 */
+	bl	get_phys_addr
+	mr	r30,r3
+	mr	r31,r4
+
+	li	r25,0			/* phys kernel start (low) */
+	li	r24,0			/* CPU number */
+	li	r23,0			/* phys kernel start (high) */
+
+#ifdef CONFIG_RELOCATABLE
+	LOAD_REG_ADDR_PIC(r3, _stext)	/* Get our current runtime base */
+
+	/* Translate _stext address to physical, save in r23/r25 */
+	bl	get_phys_addr
+	mr	r23,r3
+	mr	r25,r4
+
+	bl	0f
+0:	mflr	r8
+	addis	r3,r8,(is_second_reloc - 0b)@ha
+	lwz	r19,(is_second_reloc - 0b)@l(r3)
+
+	/* Check if this is the second relocation. */
+	cmpwi	r19,1
+	bne	1f
+
+	/*
+	 * For the second relocation, we already get the real memstart_addr
+	 * from device tree. So we will map PAGE_OFFSET to memstart_addr,
+	 * then the virtual address of start kernel should be:
+	 *          PAGE_OFFSET + (kernstart_addr - memstart_addr)
+	 * Since the offset between kernstart_addr and memstart_addr should
+	 * never be beyond 1G, so we can just use the lower 32bit of them
+	 * for the calculation.
+	 */
+	lis	r3,PAGE_OFFSET@h
+
+	addis	r4,r8,(kernstart_addr - 0b)@ha
+	addi	r4,r4,(kernstart_addr - 0b)@l
+	lwz	r5,4(r4)
+
+	addis	r6,r8,(memstart_addr - 0b)@ha
+	addi	r6,r6,(memstart_addr - 0b)@l
+	lwz	r7,4(r6)
+
+	subf	r5,r7,r5
+	add	r3,r3,r5
+	b	2f
+
+1:
+	/*
+	 * We have the runtime (virutal) address of our base.
+	 * We calculate our shift of offset from a 64M page.
+	 * We could map the 64M page we belong to at PAGE_OFFSET and
+	 * get going from there.
+	 */
+	lis	r4,KERNELBASE@h
+	ori	r4,r4,KERNELBASE@l
+	rlwinm	r6,r25,0,0x3ffffff		/* r6 = PHYS_START % 64M */
+	rlwinm	r5,r4,0,0x3ffffff		/* r5 = KERNELBASE % 64M */
+	subf	r3,r5,r6			/* r3 = r6 - r5 */
+	add	r3,r4,r3			/* Required Virtual Address */
+
+2:	bl	relocate
+
+	/*
+	 * For the second relocation, we already set the right tlb entries
+	 * for the kernel space, so skip the code in fsl_booke_entry_mapping.S
+	*/
+	cmpwi	r19,1
+	beq	set_ivor
+#endif
+
+/* We try to not make any assumptions about how the boot loader
+ * setup or used the TLBs.  We invalidate all mappings from the
+ * boot loader and load a single entry in TLB1[0] to map the
+ * first 64M of kernel memory.  Any boot info passed from the
+ * bootloader needs to live in this first 64M.
+ *
+ * Requirement on bootloader:
+ *  - The page we're executing in needs to reside in TLB1 and
+ *    have IPROT=1.  If not an invalidate broadcast could
+ *    evict the entry we're currently executing in.
+ *
+ *  r3 = Index of TLB1 were executing in
+ *  r4 = Current MSR[IS]
+ *  r5 = Index of TLB1 temp mapping
+ *
+ * Later in mapin_ram we will correctly map lowmem, and resize TLB1[0]
+ * if needed
+ */
+
+_ENTRY(__early_start)
+
+#define ENTRY_MAPPING_BOOT_SETUP
+#include "fsl_booke_entry_mapping.S"
+#undef ENTRY_MAPPING_BOOT_SETUP
+
+set_ivor:
+	/* Establish the interrupt vector offsets */
+	SET_IVOR(0,  CriticalInput);
+	SET_IVOR(1,  MachineCheck);
+	SET_IVOR(2,  DataStorage);
+	SET_IVOR(3,  InstructionStorage);
+	SET_IVOR(4,  ExternalInput);
+	SET_IVOR(5,  Alignment);
+	SET_IVOR(6,  Program);
+	SET_IVOR(7,  FloatingPointUnavailable);
+	SET_IVOR(8,  SystemCall);
+	SET_IVOR(9,  AuxillaryProcessorUnavailable);
+	SET_IVOR(10, Decrementer);
+	SET_IVOR(11, FixedIntervalTimer);
+	SET_IVOR(12, WatchdogTimer);
+	SET_IVOR(13, DataTLBError);
+	SET_IVOR(14, InstructionTLBError);
+	SET_IVOR(15, DebugCrit);
+
+	/* Establish the interrupt vector base */
+	lis	r4,interrupt_base@h	/* IVPR only uses the high 16-bits */
+	mtspr	SPRN_IVPR,r4
+
+	/* Setup the defaults for TLB entries */
+	li	r2,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l
+#ifdef CONFIG_E200
+	oris	r2,r2,MAS4_TLBSELD(1)@h
+#endif
+	mtspr	SPRN_MAS4, r2
+
+#if 0
+	/* Enable DOZE */
+	mfspr	r2,SPRN_HID0
+	oris	r2,r2,HID0_DOZE@h
+	mtspr	SPRN_HID0, r2
+#endif
+
+#if !defined(CONFIG_BDI_SWITCH)
+	/*
+	 * The Abatron BDI JTAG debugger does not tolerate others
+	 * mucking with the debug registers.
+	 */
+	lis	r2,DBCR0_IDM@h
+	mtspr	SPRN_DBCR0,r2
+	isync
+	/* clear any residual debug events */
+	li	r2,-1
+	mtspr	SPRN_DBSR,r2
+#endif
+
+#ifdef CONFIG_SMP
+	/* Check to see if we're the second processor, and jump
+	 * to the secondary_start code if so
+	 */
+	LOAD_REG_ADDR_PIC(r24, boot_cpuid)
+	lwz	r24, 0(r24)
+	cmpwi	r24, -1
+	mfspr   r24,SPRN_PIR
+	bne	__secondary_start
+#endif
+
+	/*
+	 * This is where the main kernel code starts.
+	 */
+
+	/* ptr to current */
+	lis	r2,init_task@h
+	ori	r2,r2,init_task@l
+
+	/* ptr to current thread */
+	addi	r4,r2,THREAD	/* init task's THREAD */
+	mtspr	SPRN_SPRG_THREAD,r4
+
+	/* stack */
+	lis	r1,init_thread_union@h
+	ori	r1,r1,init_thread_union@l
+	li	r0,0
+	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
+
+	CURRENT_THREAD_INFO(r22, r1)
+	stw	r24, TI_CPU(r22)
+
+	bl	early_init
+
+#ifdef CONFIG_RELOCATABLE
+	mr	r3,r30
+	mr	r4,r31
+#ifdef CONFIG_PHYS_64BIT
+	mr	r5,r23
+	mr	r6,r25
+#else
+	mr	r5,r25
+#endif
+	bl	relocate_init
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMSTART
+	lis	r3,kernstart_addr@ha
+	la	r3,kernstart_addr@l(r3)
+#ifdef CONFIG_PHYS_64BIT
+	stw	r23,0(r3)
+	stw	r25,4(r3)
+#else
+	stw	r25,0(r3)
+#endif
+#endif
+
+/*
+ * Decide what sort of machine this is and initialize the MMU.
+ */
+	mr	r3,r30
+	mr	r4,r31
+	bl	machine_init
+	bl	MMU_init
+
+	/* Setup PTE pointers for the Abatron bdiGDB */
+	lis	r6, swapper_pg_dir@h
+	ori	r6, r6, swapper_pg_dir@l
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	lis	r4, KERNELBASE@h
+	ori	r4, r4, KERNELBASE@l
+	stw	r5, 0(r4)	/* Save abatron_pteptrs at a fixed location */
+	stw	r6, 0(r5)
+
+	/* Let's move on */
+	lis	r4,start_kernel@h
+	ori	r4,r4,start_kernel@l
+	lis	r3,MSR_KERNEL@h
+	ori	r3,r3,MSR_KERNEL@l
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r3
+	rfi			/* change context and jump to start_kernel */
+
+/* Macros to hide the PTE size differences
+ *
+ * FIND_PTE -- walks the page tables given EA & pgdir pointer
+ *   r10 -- EA of fault
+ *   r11 -- PGDIR pointer
+ *   r12 -- free
+ *   label 2: is the bailout case
+ *
+ * if we find the pte (fall through):
+ *   r11 is low pte word
+ *   r12 is pointer to the pte
+ *   r10 is the pshift from the PGD, if we're a hugepage
+ */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_HUGETLB_PAGE
+#define FIND_PTE	\
+	rlwinm	r12, r10, 13, 19, 29;	/* Compute pgdir/pmd offset */	\
+	lwzx	r11, r12, r11;		/* Get pgd/pmd entry */		\
+	rlwinm.	r12, r11, 0, 0, 20;	/* Extract pt base address */	\
+	blt	1000f;			/* Normal non-huge page */	\
+	beq	2f;			/* Bail if no table */		\
+	oris	r11, r11, PD_HUGE@h;	/* Put back address bit */	\
+	andi.	r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */	\
+	xor	r12, r10, r11;		/* drop size bits from pointer */ \
+	b	1001f;							\
+1000:	rlwimi	r12, r10, 23, 20, 28;	/* Compute pte address */	\
+	li	r10, 0;			/* clear r10 */			\
+1001:	lwz	r11, 4(r12);		/* Get pte entry */
+#else
+#define FIND_PTE	\
+	rlwinm	r12, r10, 13, 19, 29;	/* Compute pgdir/pmd offset */	\
+	lwzx	r11, r12, r11;		/* Get pgd/pmd entry */		\
+	rlwinm.	r12, r11, 0, 0, 20;	/* Extract pt base address */	\
+	beq	2f;			/* Bail if no table */		\
+	rlwimi	r12, r10, 23, 20, 28;	/* Compute pte address */	\
+	lwz	r11, 4(r12);		/* Get pte entry */
+#endif /* HUGEPAGE */
+#else /* !PTE_64BIT */
+#define FIND_PTE	\
+	rlwimi	r11, r10, 12, 20, 29;	/* Create L1 (pgdir/pmd) address */	\
+	lwz	r11, 0(r11);		/* Get L1 entry */			\
+	rlwinm.	r12, r11, 0, 0, 19;	/* Extract L2 (pte) base address */	\
+	beq	2f;			/* Bail if no table */			\
+	rlwimi	r12, r10, 22, 20, 29;	/* Compute PTE address */		\
+	lwz	r11, 0(r12);		/* Get Linux PTE */
+#endif
+
+/*
+ * Interrupt vector entry code
+ *
+ * The Book E MMUs are always on so we don't need to handle
+ * interrupts in real mode as with previous PPC processors. In
+ * this case we handle interrupts in the kernel virtual address
+ * space.
+ *
+ * Interrupt vectors are dynamically placed relative to the
+ * interrupt prefix as determined by the address of interrupt_base.
+ * The interrupt vectors offsets are programmed using the labels
+ * for each interrupt vector entry.
+ *
+ * Interrupt vectors must be aligned on a 16 byte boundary.
+ * We align on a 32 byte cache line boundary for good measure.
+ */
+
+interrupt_base:
+	/* Critical Input Interrupt */
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
+
+	/* Machine Check Interrupt */
+#ifdef CONFIG_E200
+	/* no RFMCI, MCSRRs on E200 */
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
+#else
+	MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+#endif
+
+	/* Data Storage Interrupt */
+	START_EXCEPTION(DataStorage)
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
+	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
+	stw	r5,_ESR(r11)
+	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
+	andis.	r10,r5,(ESR_ILK|ESR_DLK)@h
+	bne	1f
+	EXC_XFER_LITE(0x0300, handle_page_fault)
+1:
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE_LITE(0x0300, CacheLockingException)
+
+	/* Instruction Storage Interrupt */
+	INSTRUCTION_STORAGE_EXCEPTION
+
+	/* External Input Interrupt */
+	EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE)
+
+	/* Alignment Interrupt */
+	ALIGNMENT_EXCEPTION
+
+	/* Program Interrupt */
+	PROGRAM_EXCEPTION
+
+	/* Floating Point Unavailable Interrupt */
+#ifdef CONFIG_PPC_FPU
+	FP_UNAVAILABLE_EXCEPTION
+#else
+#ifdef CONFIG_E200
+	/* E200 treats 'normal' floating point instructions as FP Unavail exception */
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  program_check_exception, EXC_XFER_EE)
+#else
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
+#endif
+#endif
+
+	/* System Call Interrupt */
+	START_EXCEPTION(SystemCall)
+	NORMAL_EXCEPTION_PROLOG(SYSCALL)
+	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
+
+	/* Auxiliary Processor Unavailable Interrupt */
+	EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
+
+	/* Decrementer Interrupt */
+	DECREMENTER_EXCEPTION
+
+	/* Fixed Internal Timer Interrupt */
+	/* TODO: Add FIT support */
+	EXCEPTION(0x3100, FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
+
+	/* Watchdog Timer Interrupt */
+#ifdef CONFIG_BOOKE_WDT
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException)
+#else
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception)
+#endif
+
+	/* Data TLB Error Interrupt */
+	START_EXCEPTION(DataTLBError)
+	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
+	mfspr	r10, SPRN_SPRG_THREAD
+	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+	stw	r12, THREAD_NORMSAVE(1)(r10)
+	stw	r13, THREAD_NORMSAVE(2)(r10)
+	mfcr	r13
+	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1
+START_BTB_FLUSH_SECTION
+	mfspr r11, SPRN_SRR1
+	andi. r10,r11,MSR_PR
+	beq 1f
+	BTB_FLUSH(r10)
+1:
+END_BTB_FLUSH_SECTION
+	mfspr	r10, SPRN_DEAR		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	5, r10, r11
+	blt	5, 3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+
+	mfspr	r12,SPRN_MAS1		/* Set TID to 0 */
+	rlwinm	r12,r12,0,16,1
+	mtspr	SPRN_MAS1,r12
+
+	b	4f
+
+	/* Get the PGD for the current thread */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+
+4:
+	/* Mask of required permission bits. Note that while we
+	 * do copy ESR:ST to _PAGE_RW position as trying to write
+	 * to an RO page is pretty common, we don't do it with
+	 * _PAGE_DIRTY. We could do it, but it's a fairly rare
+	 * event so I'd rather take the overhead when it happens
+	 * rather than adding an instruction here. We should measure
+	 * whether the whole thing is worth it in the first place
+	 * as we could avoid loading SPRN_ESR completely in the first
+	 * place...
+	 *
+	 * TODO: Is it worth doing that mfspr & rlwimi in the first
+	 *       place or can we save a couple of instructions here ?
+	 */
+	mfspr	r12,SPRN_ESR
+#ifdef CONFIG_PTE_64BIT
+	li	r13,_PAGE_PRESENT
+	oris	r13,r13,_PAGE_ACCESSED@h
+#else
+	li	r13,_PAGE_PRESENT|_PAGE_ACCESSED
+#endif
+	rlwimi	r13,r12,11,29,29
+
+	FIND_PTE
+	andc.	r13,r13,r11		/* Check permission */
+
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r13,r11,r12		/* create false data dep */
+	lwzx	r13,r11,r13		/* Get upper pte bits */
+#else
+	lwz	r13,0(r12)		/* Get upper pte bits */
+#endif
+#endif
+
+	bne	2f			/* Bail if permission/valid mismach */
+
+	/* Jump to common tlb load */
+	b	finish_tlb_load
+2:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r10, SPRN_SPRG_THREAD
+	lwz	r11, THREAD_NORMSAVE(3)(r10)
+	mtcr	r11
+	lwz	r13, THREAD_NORMSAVE(2)(r10)
+	lwz	r12, THREAD_NORMSAVE(1)(r10)
+	lwz	r11, THREAD_NORMSAVE(0)(r10)
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	b	DataStorage
+
+	/* Instruction TLB Error Interrupt */
+	/*
+	 * Nearly the same as above, except we get our
+	 * information from different registers and bailout
+	 * to a different point.
+	 */
+	START_EXCEPTION(InstructionTLBError)
+	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
+	mfspr	r10, SPRN_SPRG_THREAD
+	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+	stw	r12, THREAD_NORMSAVE(1)(r10)
+	stw	r13, THREAD_NORMSAVE(2)(r10)
+	mfcr	r13
+	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1
+START_BTB_FLUSH_SECTION
+	mfspr r11, SPRN_SRR1
+	andi. r10,r11,MSR_PR
+	beq 1f
+	BTB_FLUSH(r10)
+1:
+END_BTB_FLUSH_SECTION
+
+	mfspr	r10, SPRN_SRR0		/* Get faulting address */
+
+	/* If we are faulting a kernel address, we have to use the
+	 * kernel page tables.
+	 */
+	lis	r11, PAGE_OFFSET@h
+	cmplw	5, r10, r11
+	blt	5, 3f
+	lis	r11, swapper_pg_dir@h
+	ori	r11, r11, swapper_pg_dir@l
+
+	mfspr	r12,SPRN_MAS1		/* Set TID to 0 */
+	rlwinm	r12,r12,0,16,1
+	mtspr	SPRN_MAS1,r12
+
+	/* Make up the required permissions for kernel code */
+#ifdef CONFIG_PTE_64BIT
+	li	r13,_PAGE_PRESENT | _PAGE_BAP_SX
+	oris	r13,r13,_PAGE_ACCESSED@h
+#else
+	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+#endif
+	b	4f
+
+	/* Get the PGD for the current thread */
+3:
+	mfspr	r11,SPRN_SPRG_THREAD
+	lwz	r11,PGDIR(r11)
+
+	/* Make up the required permissions for user code */
+#ifdef CONFIG_PTE_64BIT
+	li	r13,_PAGE_PRESENT | _PAGE_BAP_UX
+	oris	r13,r13,_PAGE_ACCESSED@h
+#else
+	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+#endif
+
+4:
+	FIND_PTE
+	andc.	r13,r13,r11		/* Check permission */
+
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r13,r11,r12		/* create false data dep */
+	lwzx	r13,r11,r13		/* Get upper pte bits */
+#else
+	lwz	r13,0(r12)		/* Get upper pte bits */
+#endif
+#endif
+
+	bne	2f			/* Bail if permission mismach */
+
+	/* Jump to common TLB load point */
+	b	finish_tlb_load
+
+2:
+	/* The bailout.  Restore registers to pre-exception conditions
+	 * and call the heavyweights to help us out.
+	 */
+	mfspr	r10, SPRN_SPRG_THREAD
+	lwz	r11, THREAD_NORMSAVE(3)(r10)
+	mtcr	r11
+	lwz	r13, THREAD_NORMSAVE(2)(r10)
+	lwz	r12, THREAD_NORMSAVE(1)(r10)
+	lwz	r11, THREAD_NORMSAVE(0)(r10)
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	b	InstructionStorage
+
+/* Define SPE handlers for e200 and e500v2 */
+#ifdef CONFIG_SPE
+	/* SPE Unavailable */
+	START_EXCEPTION(SPEUnavailable)
+	NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL)
+	beq	1f
+	bl	load_up_spe
+	b	fast_exception_return
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	EXC_XFER_EE_LITE(0x2010, KernelSPE)
+#elif defined(CONFIG_SPE_POSSIBLE)
+	EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
+#endif /* CONFIG_SPE_POSSIBLE */
+
+	/* SPE Floating Point Data */
+#ifdef CONFIG_SPE
+	EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData,
+		  SPEFloatingPointException, EXC_XFER_EE)
+
+	/* SPE Floating Point Round */
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  SPEFloatingPointRoundException, EXC_XFER_EE)
+#elif defined(CONFIG_SPE_POSSIBLE)
+	EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData,
+		  unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  unknown_exception, EXC_XFER_EE)
+#endif /* CONFIG_SPE_POSSIBLE */
+
+
+	/* Performance Monitor */
+	EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \
+		  performance_monitor_exception, EXC_XFER_STD)
+
+	EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD)
+
+	CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \
+			   CriticalDoorbell, unknown_exception)
+
+	/* Debug Interrupt */
+	DEBUG_DEBUG_EXCEPTION
+	DEBUG_CRIT_EXCEPTION
+
+	GUEST_DOORBELL_EXCEPTION
+
+	CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \
+			   unknown_exception)
+
+	/* Hypercall */
+	EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE)
+
+	/* Embedded Hypervisor Privilege */
+	EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)
+
+interrupt_end:
+
+/*
+ * Local functions
+ */
+
+/*
+ * Both the instruction and data TLB miss get to this
+ * point to load the TLB.
+ *	r10 - tsize encoding (if HUGETLB_PAGE) or available to use
+ *	r11 - TLB (info from Linux PTE)
+ *	r12 - available to use
+ *	r13 - upper bits of PTE (if PTE_64BIT) or available to use
+ *	CR5 - results of addr >= PAGE_OFFSET
+ *	MAS0, MAS1 - loaded with proper value when we get here
+ *	MAS2, MAS3 - will need additional info from Linux PTE
+ *	Upon exit, we reload everything and RFI.
+ */
+finish_tlb_load:
+#ifdef CONFIG_HUGETLB_PAGE
+	cmpwi	6, r10, 0			/* check for huge page */
+	beq	6, finish_tlb_load_cont    	/* !huge */
+
+	/* Alas, we need more scratch registers for hugepages */
+	mfspr	r12, SPRN_SPRG_THREAD
+	stw	r14, THREAD_NORMSAVE(4)(r12)
+	stw	r15, THREAD_NORMSAVE(5)(r12)
+	stw	r16, THREAD_NORMSAVE(6)(r12)
+	stw	r17, THREAD_NORMSAVE(7)(r12)
+
+	/* Get the next_tlbcam_idx percpu var */
+#ifdef CONFIG_SMP
+	lwz	r12, THREAD_INFO-THREAD(r12)
+	lwz	r15, TI_CPU(r12)
+	lis     r14, __per_cpu_offset@h
+	ori     r14, r14, __per_cpu_offset@l
+	rlwinm  r15, r15, 2, 0, 29
+	lwzx    r16, r14, r15
+#else
+	li	r16, 0
+#endif
+	lis     r17, next_tlbcam_idx@h
+	ori	r17, r17, next_tlbcam_idx@l
+	add	r17, r17, r16			/* r17 = *next_tlbcam_idx */
+	lwz     r15, 0(r17)			/* r15 = next_tlbcam_idx */
+
+	lis	r14, MAS0_TLBSEL(1)@h		/* select TLB1 (TLBCAM) */
+	rlwimi	r14, r15, 16, 4, 15		/* next_tlbcam_idx entry */
+	mtspr	SPRN_MAS0, r14
+
+	/* Extract TLB1CFG(NENTRY) */
+	mfspr	r16, SPRN_TLB1CFG
+	andi.	r16, r16, 0xfff
+
+	/* Update next_tlbcam_idx, wrapping when necessary */
+	addi	r15, r15, 1
+	cmpw	r15, r16
+	blt 	100f
+	lis	r14, tlbcam_index@h
+	ori	r14, r14, tlbcam_index@l
+	lwz	r15, 0(r14)
+100:	stw	r15, 0(r17)
+
+	/*
+	 * Calc MAS1_TSIZE from r10 (which has pshift encoded)
+	 * tlb_enc = (pshift - 10).
+	 */
+	subi	r15, r10, 10
+	mfspr	r16, SPRN_MAS1
+	rlwimi	r16, r15, 7, 20, 24
+	mtspr	SPRN_MAS1, r16
+
+	/* copy the pshift for use later */
+	mr	r14, r10
+
+	/* fall through */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+	/*
+	 * We set execute, because we don't have the granularity to
+	 * properly set this at the page level (Linux problem).
+	 * Many of these bits are software only.  Bits we don't set
+	 * here we (properly should) assume have the appropriate value.
+	 */
+finish_tlb_load_cont:
+#ifdef CONFIG_PTE_64BIT
+	rlwinm	r12, r11, 32-2, 26, 31	/* Move in perm bits */
+	andi.	r10, r11, _PAGE_DIRTY
+	bne	1f
+	li	r10, MAS3_SW | MAS3_UW
+	andc	r12, r12, r10
+1:	rlwimi	r12, r13, 20, 0, 11	/* grab RPN[32:43] */
+	rlwimi	r12, r11, 20, 12, 19	/* grab RPN[44:51] */
+2:	mtspr	SPRN_MAS3, r12
+BEGIN_MMU_FTR_SECTION
+	srwi	r10, r13, 12		/* grab RPN[12:31] */
+	mtspr	SPRN_MAS7, r10
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
+#else
+	li	r10, (_PAGE_EXEC | _PAGE_PRESENT)
+	mr	r13, r11
+	rlwimi	r10, r11, 31, 29, 29	/* extract _PAGE_DIRTY into SW */
+	and	r12, r11, r10
+	andi.	r10, r11, _PAGE_USER	/* Test for _PAGE_USER */
+	slwi	r10, r12, 1
+	or	r10, r10, r12
+	iseleq	r12, r12, r10
+	rlwimi	r13, r12, 0, 20, 31	/* Get RPN from PTE, merge w/ perms */
+	mtspr	SPRN_MAS3, r13
+#endif
+
+	mfspr	r12, SPRN_MAS2
+#ifdef CONFIG_PTE_64BIT
+	rlwimi	r12, r11, 32-19, 27, 31	/* extract WIMGE from pte */
+#else
+	rlwimi	r12, r11, 26, 27, 31	/* extract WIMGE from pte */
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
+	beq	6, 3f			/* don't mask if page isn't huge */
+	li	r13, 1
+	slw	r13, r13, r14
+	subi	r13, r13, 1
+	rlwinm	r13, r13, 0, 0, 19	/* bottom bits used for WIMGE/etc */
+	andc	r12, r12, r13		/* mask off ea bits within the page */
+#endif
+3:	mtspr	SPRN_MAS2, r12
+
+#ifdef CONFIG_E200
+	/* Round robin TLB1 entries assignment */
+	mfspr	r12, SPRN_MAS0
+
+	/* Extract TLB1CFG(NENTRY) */
+	mfspr	r11, SPRN_TLB1CFG
+	andi.	r11, r11, 0xfff
+
+	/* Extract MAS0(NV) */
+	andi.	r13, r12, 0xfff
+	addi	r13, r13, 1
+	cmpw	0, r13, r11
+	addi	r12, r12, 1
+
+	/* check if we need to wrap */
+	blt	7f
+
+	/* wrap back to first free tlbcam entry */
+	lis	r13, tlbcam_index@ha
+	lwz	r13, tlbcam_index@l(r13)
+	rlwimi	r12, r13, 0, 20, 31
+7:
+	mtspr	SPRN_MAS0,r12
+#endif /* CONFIG_E200 */
+
+tlb_write_entry:
+	tlbwe
+
+	/* Done...restore registers and get out of here.  */
+	mfspr	r10, SPRN_SPRG_THREAD
+#ifdef CONFIG_HUGETLB_PAGE
+	beq	6, 8f /* skip restore for 4k page faults */
+	lwz	r14, THREAD_NORMSAVE(4)(r10)
+	lwz	r15, THREAD_NORMSAVE(5)(r10)
+	lwz	r16, THREAD_NORMSAVE(6)(r10)
+	lwz	r17, THREAD_NORMSAVE(7)(r10)
+#endif
+8:	lwz	r11, THREAD_NORMSAVE(3)(r10)
+	mtcr	r11
+	lwz	r13, THREAD_NORMSAVE(2)(r10)
+	lwz	r12, THREAD_NORMSAVE(1)(r10)
+	lwz	r11, THREAD_NORMSAVE(0)(r10)
+	mfspr	r10, SPRN_SPRG_RSCRATCH0
+	rfi					/* Force context change */
+
+#ifdef CONFIG_SPE
+/* Note that the SPE support is closely modeled after the AltiVec
+ * support.  Changes to one are likely to be applicable to the
+ * other!  */
+_GLOBAL(load_up_spe)
+/*
+ * Disable SPE for the task which had SPE previously,
+ * and save its SPE registers in its thread_struct.
+ * Enables SPE for use in the kernel on return.
+ * On SMP we know the SPE units are free, since we give it up every
+ * switch.  -- Kumar
+ */
+	mfmsr	r5
+	oris	r5,r5,MSR_SPE@h
+	mtmsr	r5			/* enable use of SPE now */
+	isync
+	/* enable use of SPE after return */
+	oris	r9,r9,MSR_SPE@h
+	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
+	li	r4,1
+	li	r10,THREAD_ACC
+	stw	r4,THREAD_USED_SPE(r5)
+	evlddx	evr4,r10,r5
+	evmra	evr4,evr4
+	REST_32EVRS(0,r10,r5,THREAD_EVR0)
+	blr
+
+/*
+ * SPE unavailable trap from kernel - print a message, but let
+ * the task use SPE in the kernel until it returns to user mode.
+ */
+KernelSPE:
+	lwz	r3,_MSR(r1)
+	oris	r3,r3,MSR_SPE@h
+	stw	r3,_MSR(r1)	/* enable use of SPE after return */
+#ifdef CONFIG_PRINTK
+	lis	r3,87f@h
+	ori	r3,r3,87f@l
+	mr	r4,r2		/* current */
+	lwz	r5,_NIP(r1)
+	bl	printk
+#endif
+	b	ret_from_except
+#ifdef CONFIG_PRINTK
+87:	.string	"SPE used in kernel  (task=%p, pc=%x)  \n"
+#endif
+	.align	4,0
+
+#endif /* CONFIG_SPE */
+
+/*
+ * Translate the effec addr in r3 to phys addr. The phys addr will be put
+ * into r3(higher 32bit) and r4(lower 32bit)
+ */
+get_phys_addr:
+	mfmsr	r8
+	mfspr	r9,SPRN_PID
+	rlwinm	r9,r9,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	rlwimi	r9,r8,28,0x00000001	/* turn MSR[DS] into MAS6[SAS] */
+	mtspr	SPRN_MAS6,r9
+
+	tlbsx	0,r3			/* must succeed */
+
+	mfspr	r8,SPRN_MAS1
+	mfspr	r12,SPRN_MAS3
+	rlwinm	r9,r8,25,0x1f		/* r9 = log2(page size) */
+	li	r10,1024
+	slw	r10,r10,r9		/* r10 = page size */
+	addi	r10,r10,-1
+	and	r11,r3,r10		/* r11 = page offset */
+	andc	r4,r12,r10		/* r4 = page base */
+	or	r4,r4,r11		/* r4 = devtree phys addr */
+#ifdef CONFIG_PHYS_64BIT
+	mfspr	r3,SPRN_MAS7
+#endif
+	blr
+
+/*
+ * Global functions
+ */
+
+#ifdef CONFIG_E200
+/* Adjust or setup IVORs for e200 */
+_GLOBAL(__setup_e200_ivors)
+	li	r3,DebugDebug@l
+	mtspr	SPRN_IVOR15,r3
+	li	r3,SPEUnavailable@l
+	mtspr	SPRN_IVOR32,r3
+	li	r3,SPEFloatingPointData@l
+	mtspr	SPRN_IVOR33,r3
+	li	r3,SPEFloatingPointRound@l
+	mtspr	SPRN_IVOR34,r3
+	sync
+	blr
+#endif
+
+#ifdef CONFIG_E500
+#ifndef CONFIG_PPC_E500MC
+/* Adjust or setup IVORs for e500v1/v2 */
+_GLOBAL(__setup_e500_ivors)
+	li	r3,DebugCrit@l
+	mtspr	SPRN_IVOR15,r3
+	li	r3,SPEUnavailable@l
+	mtspr	SPRN_IVOR32,r3
+	li	r3,SPEFloatingPointData@l
+	mtspr	SPRN_IVOR33,r3
+	li	r3,SPEFloatingPointRound@l
+	mtspr	SPRN_IVOR34,r3
+	li	r3,PerformanceMonitor@l
+	mtspr	SPRN_IVOR35,r3
+	sync
+	blr
+#else
+/* Adjust or setup IVORs for e500mc */
+_GLOBAL(__setup_e500mc_ivors)
+	li	r3,DebugDebug@l
+	mtspr	SPRN_IVOR15,r3
+	li	r3,PerformanceMonitor@l
+	mtspr	SPRN_IVOR35,r3
+	li	r3,Doorbell@l
+	mtspr	SPRN_IVOR36,r3
+	li	r3,CriticalDoorbell@l
+	mtspr	SPRN_IVOR37,r3
+	sync
+	blr
+
+/* setup ehv ivors for */
+_GLOBAL(__setup_ehv_ivors)
+	li	r3,GuestDoorbell@l
+	mtspr	SPRN_IVOR38,r3
+	li	r3,CriticalGuestDoorbell@l
+	mtspr	SPRN_IVOR39,r3
+	li	r3,Hypercall@l
+	mtspr	SPRN_IVOR40,r3
+	li	r3,Ehvpriv@l
+	mtspr	SPRN_IVOR41,r3
+	sync
+	blr
+#endif /* CONFIG_PPC_E500MC */
+#endif /* CONFIG_E500 */
+
+#ifdef CONFIG_SPE
+/*
+ * extern void __giveup_spe(struct task_struct *prev)
+ *
+ */
+_GLOBAL(__giveup_spe)
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	lwz	r5,PT_REGS(r3)
+	cmpi	0,r5,0
+	SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
+	evxor	evr6, evr6, evr6	/* clear out evr6 */
+	evmwumiaa evr6, evr6, evr6	/* evr6 <- ACC = 0 * 0 + ACC */
+	li	r4,THREAD_ACC
+	evstddx	evr6, r4, r3		/* save off accumulator */
+	beq	1f
+	lwz	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	lis	r3,MSR_SPE@h
+	andc	r4,r4,r3		/* disable SPE for previous task */
+	stw	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+	blr
+#endif /* CONFIG_SPE */
+
+/*
+ * extern void abort(void)
+ *
+ * At present, this routine just applies a system reset.
+ */
+_GLOBAL(abort)
+	li	r13,0
+	mtspr	SPRN_DBCR0,r13		/* disable all debug events */
+	isync
+	mfmsr	r13
+	ori	r13,r13,MSR_DE@l	/* Enable Debug Events */
+	mtmsr	r13
+	isync
+	mfspr	r13,SPRN_DBCR0
+	lis	r13,(DBCR0_IDM|DBCR0_RST_CHIP)@h
+	mtspr	SPRN_DBCR0,r13
+	isync
+
+_GLOBAL(set_context)
+
+#ifdef CONFIG_BDI_SWITCH
+	/* Context switch the PTE pointer for the Abatron BDI2000.
+	 * The PGDIR is the second parameter.
+	 */
+	lis	r5, abatron_pteptrs@h
+	ori	r5, r5, abatron_pteptrs@l
+	stw	r4, 0x4(r5)
+#endif
+	mtspr	SPRN_PID,r3
+	isync			/* Force context change */
+	blr
+
+#ifdef CONFIG_SMP
+/* When we get here, r24 needs to hold the CPU # */
+	.globl __secondary_start
+__secondary_start:
+	LOAD_REG_ADDR_PIC(r3, tlbcam_index)
+	lwz	r3,0(r3)
+	mtctr	r3
+	li	r26,0		/* r26 safe? */
+
+	bl	switch_to_as1
+	mr	r27,r3		/* tlb entry */
+	/* Load each CAM entry */
+1:	mr	r3,r26
+	bl	loadcam_entry
+	addi	r26,r26,1
+	bdnz	1b
+	mr	r3,r27		/* tlb entry */
+	LOAD_REG_ADDR_PIC(r4, memstart_addr)
+	lwz	r4,0(r4)
+	mr	r5,r25		/* phys kernel start */
+	rlwinm	r5,r5,0,~0x3ffffff	/* aligned 64M */
+	subf	r4,r5,r4	/* memstart_addr - phys kernel start */
+	li	r5,0		/* no device tree */
+	li	r6,0		/* not boot cpu */
+	bl	restore_to_as0
+
+
+	lis	r3,__secondary_hold_acknowledge@h
+	ori	r3,r3,__secondary_hold_acknowledge@l
+	stw	r24,0(r3)
+
+	li	r3,0
+	mr	r4,r24		/* Why? */
+	bl	call_setup_cpu
+
+	/* get current_thread_info and current */
+	lis	r1,secondary_ti@ha
+	lwz	r1,secondary_ti@l(r1)
+	lwz	r2,TI_TASK(r1)
+
+	/* stack */
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	li	r0,0
+	stw	r0,0(r1)
+
+	/* ptr to current thread */
+	addi	r4,r2,THREAD	/* address of our thread_struct */
+	mtspr	SPRN_SPRG_THREAD,r4
+
+	/* Setup the defaults for TLB entries */
+	li	r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l
+	mtspr	SPRN_MAS4,r4
+
+	/* Jump to start_secondary */
+	lis	r4,MSR_KERNEL@h
+	ori	r4,r4,MSR_KERNEL@l
+	lis	r3,start_secondary@h
+	ori	r3,r3,start_secondary@l
+	mtspr	SPRN_SRR0,r3
+	mtspr	SPRN_SRR1,r4
+	sync
+	rfi
+	sync
+
+	.globl __secondary_hold_acknowledge
+__secondary_hold_acknowledge:
+	.long	-1
+#endif
+
+/*
+ * Create a tlb entry with the same effective and physical address as
+ * the tlb entry used by the current running code. But set the TS to 1.
+ * Then switch to the address space 1. It will return with the r3 set to
+ * the ESEL of the new created tlb.
+ */
+_GLOBAL(switch_to_as1)
+	mflr	r5
+
+	/* Find a entry not used */
+	mfspr	r3,SPRN_TLB1CFG
+	andi.	r3,r3,0xfff
+	mfspr	r4,SPRN_PID
+	rlwinm	r4,r4,16,0x3fff0000	/* turn PID into MAS6[SPID] */
+	mtspr	SPRN_MAS6,r4
+1:	lis	r4,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	addi	r3,r3,-1
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbre
+	mfspr	r4,SPRN_MAS1
+	andis.	r4,r4,MAS1_VALID@h
+	bne	1b
+
+	/* Get the tlb entry used by the current running code */
+	bl	0f
+0:	mflr	r4
+	tlbsx	0,r4
+
+	mfspr	r4,SPRN_MAS1
+	ori	r4,r4,MAS1_TS		/* Set the TS = 1 */
+	mtspr	SPRN_MAS1,r4
+
+	mfspr	r4,SPRN_MAS0
+	rlwinm	r4,r4,0,~MAS0_ESEL_MASK
+	rlwimi	r4,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r4
+	tlbwe
+	isync
+	sync
+
+	mfmsr	r4
+	ori	r4,r4,MSR_IS | MSR_DS
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r4
+	sync
+	rfi
+
+/*
+ * Restore to the address space 0 and also invalidate the tlb entry created
+ * by switch_to_as1.
+ * r3 - the tlb entry which should be invalidated
+ * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0)
+ * r5 - device tree virtual address. If r4 is 0, r5 is ignored.
+ * r6 - boot cpu
+*/
+_GLOBAL(restore_to_as0)
+	mflr	r0
+
+	bl	0f
+0:	mflr	r9
+	addi	r9,r9,1f - 0b
+
+	/*
+	 * We may map the PAGE_OFFSET in AS0 to a different physical address,
+	 * so we need calculate the right jump and device tree address based
+	 * on the offset passed by r4.
+	 */
+	add	r9,r9,r4
+	add	r5,r5,r4
+	add	r0,r0,r4
+
+2:	mfmsr	r7
+	li	r8,(MSR_IS | MSR_DS)
+	andc	r7,r7,r8
+
+	mtspr	SPRN_SRR0,r9
+	mtspr	SPRN_SRR1,r7
+	sync
+	rfi
+
+	/* Invalidate the temporary tlb entry for AS1 */
+1:	lis	r9,0x1000		/* Set MAS0(TLBSEL) = 1 */
+	rlwimi	r9,r3,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r3) */
+	mtspr	SPRN_MAS0,r9
+	tlbre
+	mfspr	r9,SPRN_MAS1
+	rlwinm	r9,r9,0,2,31		/* Clear MAS1 Valid and IPPROT */
+	mtspr	SPRN_MAS1,r9
+	tlbwe
+	isync
+
+	cmpwi	r4,0
+	cmpwi	cr1,r6,0
+	cror	eq,4*cr1+eq,eq
+	bne	3f			/* offset != 0 && is_boot_cpu */
+	mtlr	r0
+	blr
+
+	/*
+	 * The PAGE_OFFSET will map to a different physical address,
+	 * jump to _start to do another relocation again.
+	*/
+3:	mr	r3,r5
+	bl	_start
+
+/*
+ * We put a few things here that have to be page-aligned. This stuff
+ * goes at the beginning of the data segment, which is page-aligned.
+ */
+	.data
+	.align	12
+	.globl	sdata
+sdata:
+	.globl	empty_zero_page
+empty_zero_page:
+	.space	4096
+EXPORT_SYMBOL(empty_zero_page)
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PGD_TABLE_SIZE
+
+/*
+ * Room for two PTE pointers, usually the kernel and current user pointers
+ * to their respective root page table.
+ */
+abatron_pteptrs:
+	.space	8
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
new file mode 100644
index 000000000..fec8a6773
--- /dev/null
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -0,0 +1,378 @@
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers. Derived from
+ * "arch/x86/kernel/hw_breakpoint.c"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2010 IBM Corporation
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ *
+ */
+
+#include <linux/hw_breakpoint.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/sstep.h>
+#include <asm/debug.h>
+#include <linux/uaccess.h>
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for every cpu
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg);
+
+/*
+ * Returns total number of data or instruction breakpoints available.
+ */
+int hw_breakpoint_slots(int type)
+{
+	if (type == TYPE_DATA)
+		return HBP_NUM;
+	return 0;		/* no instruction breakpoints available */
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	struct perf_event **slot = this_cpu_ptr(&bp_per_reg);
+
+	*slot = bp;
+
+	/*
+	 * Do not install DABR values if the instruction must be single-stepped.
+	 * If so, DABR will be populated in single_step_dabr_instruction().
+	 */
+	if (current->thread.last_hit_ubp != bp)
+		__set_breakpoint(info);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct perf_event **slot = this_cpu_ptr(&bp_per_reg);
+
+	if (*slot != bp) {
+		WARN_ONCE(1, "Can't find the breakpoint");
+		return;
+	}
+
+	*slot = NULL;
+	hw_breakpoint_disable();
+}
+
+/*
+ * Perform cleanup of arch-specific counters during unregistration
+ * of the perf-event
+ */
+void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+	/*
+	 * If the breakpoint is unregistered between a hw_breakpoint_handler()
+	 * and the single_step_dabr_instruction(), then cleanup the breakpoint
+	 * restoration variables to prevent dangling pointers.
+	 * FIXME, this should not be using bp->ctx at all! Sayeth peterz.
+	 */
+	if (bp->ctx && bp->ctx->task && bp->ctx->task != ((void *)-1L))
+		bp->ctx->task->thread.last_hit_ubp = NULL;
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+	return is_kernel_addr(hw->address);
+}
+
+int arch_bp_generic_fields(int type, int *gen_bp_type)
+{
+	*gen_bp_type = 0;
+	if (type & HW_BRK_TYPE_READ)
+		*gen_bp_type |= HW_BREAKPOINT_R;
+	if (type & HW_BRK_TYPE_WRITE)
+		*gen_bp_type |= HW_BREAKPOINT_W;
+	if (*gen_bp_type == 0)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+			     const struct perf_event_attr *attr,
+			     struct arch_hw_breakpoint *hw)
+{
+	int ret = -EINVAL, length_max;
+
+	if (!bp)
+		return ret;
+
+	hw->type = HW_BRK_TYPE_TRANSLATE;
+	if (attr->bp_type & HW_BREAKPOINT_R)
+		hw->type |= HW_BRK_TYPE_READ;
+	if (attr->bp_type & HW_BREAKPOINT_W)
+		hw->type |= HW_BRK_TYPE_WRITE;
+	if (hw->type == HW_BRK_TYPE_TRANSLATE)
+		/* must set alteast read or write */
+		return ret;
+	if (!attr->exclude_user)
+		hw->type |= HW_BRK_TYPE_USER;
+	if (!attr->exclude_kernel)
+		hw->type |= HW_BRK_TYPE_KERNEL;
+	if (!attr->exclude_hv)
+		hw->type |= HW_BRK_TYPE_HYP;
+	hw->address = attr->bp_addr;
+	hw->len = attr->bp_len;
+
+	/*
+	 * Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8)
+	 * and breakpoint addresses are aligned to nearest double-word
+	 * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the
+	 * 'symbolsize' should satisfy the check below.
+	 */
+	if (!ppc_breakpoint_available())
+		return -ENODEV;
+	length_max = 8; /* DABR */
+	if (cpu_has_feature(CPU_FTR_DAWR)) {
+		length_max = 512 ; /* 64 doublewords */
+		/* DAWR region can't cross 512 boundary */
+		if ((attr->bp_addr >> 9) !=
+		    ((attr->bp_addr + attr->bp_len - 1) >> 9))
+			return -EINVAL;
+	}
+	if (hw->len >
+	    (length_max - (hw->address & HW_BREAKPOINT_ALIGN)))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Restores the breakpoint on the debug registers.
+ * Invoke this function if it is known that the execution context is
+ * about to change to cause loss of MSR_SE settings.
+ */
+void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
+{
+	struct arch_hw_breakpoint *info;
+
+	if (likely(!tsk->thread.last_hit_ubp))
+		return;
+
+	info = counter_arch_bp(tsk->thread.last_hit_ubp);
+	regs->msr &= ~MSR_SE;
+	__set_breakpoint(info);
+	tsk->thread.last_hit_ubp = NULL;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int hw_breakpoint_handler(struct die_args *args)
+{
+	int rc = NOTIFY_STOP;
+	struct perf_event *bp;
+	struct pt_regs *regs = args->regs;
+#ifndef CONFIG_PPC_8xx
+	int stepped = 1;
+	unsigned int instr;
+#endif
+	struct arch_hw_breakpoint *info;
+	unsigned long dar = regs->dar;
+
+	/* Disable breakpoints during exception handling */
+	hw_breakpoint_disable();
+
+	/*
+	 * The counter may be concurrently released but that can only
+	 * occur from a call_rcu() path. We can then safely fetch
+	 * the breakpoint, use its callback, touch its counter
+	 * while we are in an rcu_read_lock() path.
+	 */
+	rcu_read_lock();
+
+	bp = __this_cpu_read(bp_per_reg);
+	if (!bp) {
+		rc = NOTIFY_DONE;
+		goto out;
+	}
+	info = counter_arch_bp(bp);
+
+	/*
+	 * Return early after invoking user-callback function without restoring
+	 * DABR if the breakpoint is from ptrace which always operates in
+	 * one-shot mode. The ptrace-ed process will receive the SIGTRAP signal
+	 * generated in do_dabr().
+	 */
+	if (bp->overflow_handler == ptrace_triggered) {
+		perf_bp_event(bp, regs);
+		rc = NOTIFY_DONE;
+		goto out;
+	}
+
+	/*
+	 * Verify if dar lies within the address range occupied by the symbol
+	 * being watched to filter extraneous exceptions.  If it doesn't,
+	 * we still need to single-step the instruction, but we don't
+	 * generate an event.
+	 */
+	info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
+	if (!((bp->attr.bp_addr <= dar) &&
+	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
+		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
+
+#ifndef CONFIG_PPC_8xx
+	/* Do not emulate user-space instructions, instead single-step them */
+	if (user_mode(regs)) {
+		current->thread.last_hit_ubp = bp;
+		regs->msr |= MSR_SE;
+		goto out;
+	}
+
+	stepped = 0;
+	instr = 0;
+	if (!__get_user_inatomic(instr, (unsigned int *) regs->nip))
+		stepped = emulate_step(regs, instr);
+
+	/*
+	 * emulate_step() could not execute it. We've failed in reliably
+	 * handling the hw-breakpoint. Unregister it and throw a warning
+	 * message to let the user know about it.
+	 */
+	if (!stepped) {
+		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
+			"0x%lx will be disabled.", info->address);
+		perf_event_disable_inatomic(bp);
+		goto out;
+	}
+#endif
+	/*
+	 * As a policy, the callback is invoked in a 'trigger-after-execute'
+	 * fashion
+	 */
+	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
+		perf_bp_event(bp, regs);
+
+	__set_breakpoint(info);
+out:
+	rcu_read_unlock();
+	return rc;
+}
+NOKPROBE_SYMBOL(hw_breakpoint_handler);
+
+/*
+ * Handle single-step exceptions following a DABR hit.
+ */
+static int single_step_dabr_instruction(struct die_args *args)
+{
+	struct pt_regs *regs = args->regs;
+	struct perf_event *bp = NULL;
+	struct arch_hw_breakpoint *info;
+
+	bp = current->thread.last_hit_ubp;
+	/*
+	 * Check if we are single-stepping as a result of a
+	 * previous HW Breakpoint exception
+	 */
+	if (!bp)
+		return NOTIFY_DONE;
+
+	info = counter_arch_bp(bp);
+
+	/*
+	 * We shall invoke the user-defined callback function in the single
+	 * stepping handler to confirm to 'trigger-after-execute' semantics
+	 */
+	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
+		perf_bp_event(bp, regs);
+
+	__set_breakpoint(info);
+	current->thread.last_hit_ubp = NULL;
+
+	/*
+	 * If the process was being single-stepped by ptrace, let the
+	 * other single-step actions occur (e.g. generate SIGTRAP).
+	 */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		return NOTIFY_DONE;
+
+	return NOTIFY_STOP;
+}
+NOKPROBE_SYMBOL(single_step_dabr_instruction);
+
+/*
+ * Handle debug exception notifications.
+ */
+int hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	int ret = NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_DABR_MATCH:
+		ret = hw_breakpoint_handler(data);
+		break;
+	case DIE_SSTEP:
+		ret = single_step_dabr_instruction(data);
+		break;
+	}
+
+	return ret;
+}
+NOKPROBE_SYMBOL(hw_breakpoint_exceptions_notify);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_struct *t = &tsk->thread;
+
+	unregister_hw_breakpoint(t->ptrace_bps[0]);
+	t->ptrace_bps[0] = NULL;
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
new file mode 100644
index 000000000..ca79aacfe
--- /dev/null
+++ b/arch/powerpc/kernel/idle.c
@@ -0,0 +1,115 @@
+/*
+ * Idle daemon for PowerPC.  Idle daemon will handle any action
+ * that needs to be taken when the system becomes idle.
+ *
+ * Originally written by Cort Dougan (cort@cs.nmt.edu).
+ * Subsequent 32-bit hacking by Tom Rini, Armin Kuster,
+ * Paul Mackerras and others.
+ *
+ * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * Additional shared processor, SMT, and firmware support
+ *    Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *
+ * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/sysctl.h>
+#include <linux/tick.h>
+
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include <asm/runlatch.h>
+#include <asm/smp.h>
+
+
+unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(cpuidle_disable);
+
+static int __init powersave_off(char *arg)
+{
+	ppc_md.power_save = NULL;
+	cpuidle_disable = IDLE_POWERSAVE_OFF;
+	return 1;
+}
+__setup("powersave=off", powersave_off);
+
+#ifdef CONFIG_HOTPLUG_CPU
+void arch_cpu_idle_dead(void)
+{
+	sched_preempt_enable_no_resched();
+	cpu_die();
+}
+#endif
+
+void arch_cpu_idle(void)
+{
+	ppc64_runlatch_off();
+
+	if (ppc_md.power_save) {
+		ppc_md.power_save();
+		/*
+		 * Some power_save functions return with
+		 * interrupts enabled, some don't.
+		 */
+		if (irqs_disabled())
+			local_irq_enable();
+	} else {
+		local_irq_enable();
+		/*
+		 * Go into low thread priority and possibly
+		 * low power mode.
+		 */
+		HMT_low();
+		HMT_very_low();
+	}
+
+	HMT_medium();
+	ppc64_runlatch_on();
+}
+
+int powersave_nap;
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the sysctl to set/clear powersave_nap.
+ */
+static struct ctl_table powersave_nap_ctl_table[] = {
+	{
+		.procname	= "powersave-nap",
+		.data		= &powersave_nap,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+static struct ctl_table powersave_nap_sysctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= powersave_nap_ctl_table,
+	},
+	{}
+};
+
+static int __init
+register_powersave_nap_sysctl(void)
+{
+	register_sysctl_table(powersave_nap_sysctl_root);
+
+	return 0;
+}
+__initcall(register_powersave_nap_sysctl);
+#endif
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
new file mode 100644
index 000000000..75de66acc
--- /dev/null
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -0,0 +1,198 @@
+/*
+ *  This file contains the power_save function for 6xx & 7xxx CPUs
+ *  rewritten in assembler
+ *
+ *  Warning ! This code assumes that if your machine has a 750fx
+ *  it will have PLL 1 set to low speed mode (used during NAP/DOZE).
+ *  if this is not the case some additional changes will have to
+ *  be done to check a runtime var (a bit like powersave-nap)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/feature-fixups.h>
+
+	.text
+
+/*
+ * Init idle, called at early CPU setup time from head.S for each CPU
+ * Make sure no rest of NAP mode remains in HID0, save default
+ * values for some CPU specific registers. Called with r24
+ * containing CPU number and r3 reloc offset
+ */
+_GLOBAL(init_idle_6xx)
+BEGIN_FTR_SECTION
+	mfspr	r4,SPRN_HID0
+	rlwinm	r4,r4,0,10,8	/* Clear NAP */
+	mtspr	SPRN_HID0, r4
+	b	1f
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
+	blr
+1:
+	slwi	r5,r24,2
+	add	r5,r5,r3
+BEGIN_FTR_SECTION
+	mfspr	r4,SPRN_MSSCR0
+	addis	r6,r5, nap_save_msscr0@ha
+	stw	r4,nap_save_msscr0@l(r6)
+END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR)
+BEGIN_FTR_SECTION
+	mfspr	r4,SPRN_HID1
+	addis	r6,r5,nap_save_hid1@ha
+	stw	r4,nap_save_hid1@l(r6)
+END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)
+	blr
+
+/*
+ * Here is the power_save_6xx function. This could eventually be
+ * split into several functions & changing the function pointer
+ * depending on the various features.
+ */
+_GLOBAL(ppc6xx_idle)
+	/* Check if we can nap or doze, put HID0 mask in r3
+	 */
+	lis	r3, 0
+BEGIN_FTR_SECTION
+	lis	r3,HID0_DOZE@h
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE)
+BEGIN_FTR_SECTION
+	/* We must dynamically check for the NAP feature as it
+	 * can be cleared by CPU init after the fixups are done
+	 */
+	lis	r4,cur_cpu_spec@ha
+	lwz	r4,cur_cpu_spec@l(r4)
+	lwz	r4,CPU_SPEC_FEATURES(r4)
+	andi.	r0,r4,CPU_FTR_CAN_NAP
+	beq	1f
+	/* Now check if user or arch enabled NAP mode */
+	lis	r4,powersave_nap@ha
+	lwz	r4,powersave_nap@l(r4)
+	cmpwi	0,r4,0
+	beq	1f
+	lis	r3,HID0_NAP@h
+1:	
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
+	cmpwi	0,r3,0
+	beqlr
+
+	/* Some pre-nap cleanups needed on some CPUs */
+	andis.	r0,r3,HID0_NAP@h
+	beq	2f
+BEGIN_FTR_SECTION
+	/* Disable L2 prefetch on some 745x and try to ensure
+	 * L2 prefetch engines are idle. As explained by errata
+	 * text, we can't be sure they are, we just hope very hard
+	 * that well be enough (sic !). At least I noticed Apple
+	 * doesn't even bother doing the dcbf's here...
+	 */
+	mfspr	r4,SPRN_MSSCR0
+	rlwinm	r4,r4,0,0,29
+	sync
+	mtspr	SPRN_MSSCR0,r4
+	sync
+	isync
+	lis	r4,KERNELBASE@h
+	dcbf	0,r4
+	dcbf	0,r4
+	dcbf	0,r4
+	dcbf	0,r4
+END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR)
+2:
+BEGIN_FTR_SECTION
+	/* Go to low speed mode on some 750FX */
+	lis	r4,powersave_lowspeed@ha
+	lwz	r4,powersave_lowspeed@l(r4)
+	cmpwi	0,r4,0
+	beq	1f
+	mfspr	r4,SPRN_HID1
+	oris	r4,r4,0x0001
+	mtspr	SPRN_HID1,r4
+1:	
+END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)
+
+	/* Go to NAP or DOZE now */	
+	mfspr	r4,SPRN_HID0
+	lis	r5,(HID0_NAP|HID0_SLEEP)@h
+BEGIN_FTR_SECTION
+	oris	r5,r5,HID0_DOZE@h
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE)
+	andc	r4,r4,r5
+	or	r4,r4,r3
+BEGIN_FTR_SECTION
+	oris	r4,r4,HID0_DPM@h	/* that should be done once for all  */
+END_FTR_SECTION_IFCLR(CPU_FTR_NO_DPM)
+	mtspr	SPRN_HID0,r4
+BEGIN_FTR_SECTION
+	PPC_DSSALL
+	sync
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	CURRENT_THREAD_INFO(r9, r1)
+	lwz	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
+	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
+	stw	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
+	mfmsr	r7
+	ori	r7,r7,MSR_EE
+	oris	r7,r7,MSR_POW@h
+1:	sync
+	mtmsr	r7
+	isync
+	b	1b
+
+/*
+ * Return from NAP/DOZE mode, restore some CPU specific registers,
+ * we are called with DR/IR still off and r2 containing physical
+ * address of current.  R11 points to the exception frame (physical
+ * address).  We have to preserve r10.
+ */
+_GLOBAL(power_save_ppc32_restore)
+	lwz	r9,_LINK(r11)		/* interrupted in ppc6xx_idle: */
+	stw	r9,_NIP(r11)		/* make it do a blr */
+
+#ifdef CONFIG_SMP
+	CURRENT_THREAD_INFO(r12, r11)
+	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
+	slwi	r11,r11,2
+#else
+	li	r11,0
+#endif
+	/* Todo make sure all these are in the same page
+	 * and load r11 (@ha part + CPU offset) only once
+	 */
+BEGIN_FTR_SECTION
+	mfspr	r9,SPRN_HID0
+	andis.	r9,r9,HID0_NAP@h
+	beq	1f
+	addis	r9,r11,(nap_save_msscr0-KERNELBASE)@ha
+	lwz	r9,nap_save_msscr0@l(r9)
+	mtspr	SPRN_MSSCR0, r9
+	sync
+	isync
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR)
+BEGIN_FTR_SECTION
+	addis	r9,r11,(nap_save_hid1-KERNELBASE)@ha
+	lwz	r9,nap_save_hid1@l(r9)
+	mtspr	SPRN_HID1, r9
+END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)
+	b	transfer_to_handler_cont
+
+	.data
+
+_GLOBAL(nap_save_msscr0)
+	.space	4*NR_CPUS
+
+_GLOBAL(nap_save_hid1)
+	.space	4*NR_CPUS
+
+_GLOBAL(powersave_lowspeed)
+	.long	0
diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S
new file mode 100644
index 000000000..4e0d94d02
--- /dev/null
+++ b/arch/powerpc/kernel/idle_book3e.S
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2010 IBM Corp, Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *
+ * Generic idle routine for Book3E processors
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ppc-opcode.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/hw_irq.h>
+
+/* 64-bit version only for now */
+#ifdef CONFIG_PPC64
+
+.macro BOOK3E_IDLE name loop
+_GLOBAL(\name)
+	/* Save LR for later */
+	mflr	r0
+	std	r0,16(r1)
+
+	/* Hard disable interrupts */
+	wrteei	0
+
+	/* Now check if an interrupt came in while we were soft disabled
+	 * since we may otherwise lose it (doorbells etc...).
+	 */
+	lbz	r3,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r3,0
+	bne	2f
+
+	/* Now we are going to mark ourselves as soft and hard enabled in
+	 * order to be able to take interrupts while asleep. We inform lockdep
+	 * of that. We don't actually turn interrupts on just yet tho.
+	 */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	stdu    r1,-128(r1)
+	bl	trace_hardirqs_on
+	addi    r1,r1,128
+#endif
+	li	r0,IRQS_ENABLED
+	stb	r0,PACAIRQSOFTMASK(r13)
+	
+	/* Interrupts will make use return to LR, so get something we want
+	 * in there
+	 */
+	bl	1f
+
+	/* And return (interrupts are on) */
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+1:	/* Let's set the _TLF_NAPPING flag so interrupts make us return
+	 * to the right spot
+	*/
+	CURRENT_THREAD_INFO(r11, r1)
+	ld	r10,TI_LOCAL_FLAGS(r11)
+	ori	r10,r10,_TLF_NAPPING
+	std	r10,TI_LOCAL_FLAGS(r11)
+
+	/* We can now re-enable hard interrupts and go to sleep */
+	wrteei	1
+	\loop
+
+2:
+	lbz	r10,PACAIRQHAPPENED(r13)
+	ori	r10,r10,PACA_IRQ_HARD_DIS
+	stb	r10,PACAIRQHAPPENED(r13)
+	blr
+.endm
+
+.macro BOOK3E_IDLE_LOOP
+1:
+	PPC_WAIT(0)
+	b	1b
+.endm
+
+/* epapr_ev_idle_start below is patched with the proper hcall
+   opcodes during kernel initialization */
+.macro EPAPR_EV_IDLE_LOOP
+idle_loop:
+	LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+	li      r3, -1
+	nop
+	nop
+	nop
+	b       idle_loop
+.endm
+
+BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP
+
+BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP
+
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
new file mode 100644
index 000000000..4a860d3b9
--- /dev/null
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -0,0 +1,995 @@
+/*
+ *  This file contains idle entry/exit functions for POWER7,
+ *  POWER8 and POWER9 CPUs.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ppc-opcode.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/opal.h>
+#include <asm/cpuidle.h>
+#include <asm/exception-64s.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/mmu.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#undef DEBUG
+
+/*
+ * Use unused space in the interrupt stack to save and restore
+ * registers for winkle support.
+ */
+#define _MMCR0	GPR0
+#define _SDR1	GPR3
+#define _PTCR	GPR3
+#define _RPR	GPR4
+#define _SPURR	GPR5
+#define _PURR	GPR6
+#define _TSCR	GPR7
+#define _DSCR	GPR8
+#define _AMOR	GPR9
+#define _WORT	GPR10
+#define _WORC	GPR11
+#define _LPCR	GPR12
+
+#define PSSCR_EC_ESL_MASK_SHIFTED          (PSSCR_EC | PSSCR_ESL) >> 16
+
+	.text
+
+/*
+ * Used by threads before entering deep idle states. Saves SPRs
+ * in interrupt stack frame
+ */
+save_sprs_to_stack:
+	/*
+	 * Note all register i.e per-core, per-subcore or per-thread is saved
+	 * here since any thread in the core might wake up first
+	 */
+BEGIN_FTR_SECTION
+	/*
+	 * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
+	 * SDR1 here
+	 */
+	mfspr	r3,SPRN_PTCR
+	std	r3,_PTCR(r1)
+	mfspr	r3,SPRN_LPCR
+	std	r3,_LPCR(r1)
+FTR_SECTION_ELSE
+	mfspr	r3,SPRN_SDR1
+	std	r3,_SDR1(r1)
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+	mfspr	r3,SPRN_RPR
+	std	r3,_RPR(r1)
+	mfspr	r3,SPRN_SPURR
+	std	r3,_SPURR(r1)
+	mfspr	r3,SPRN_PURR
+	std	r3,_PURR(r1)
+	mfspr	r3,SPRN_TSCR
+	std	r3,_TSCR(r1)
+	mfspr	r3,SPRN_DSCR
+	std	r3,_DSCR(r1)
+	mfspr	r3,SPRN_AMOR
+	std	r3,_AMOR(r1)
+	mfspr	r3,SPRN_WORT
+	std	r3,_WORT(r1)
+	mfspr	r3,SPRN_WORC
+	std	r3,_WORC(r1)
+/*
+ * On POWER9, there are idle states such as stop4, invoked via cpuidle,
+ * that lose hypervisor resources. In such cases, we need to save
+ * additional SPRs before entering those idle states so that they can
+ * be restored to their older values on wakeup from the idle state.
+ *
+ * On POWER8, the only such deep idle state is winkle which is used
+ * only in the context of CPU-Hotplug, where these additional SPRs are
+ * reinitiazed to a sane value. Hence there is no need to save/restore
+ * these SPRs.
+ */
+BEGIN_FTR_SECTION
+	blr
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
+power9_save_additional_sprs:
+	mfspr	r3, SPRN_PID
+	mfspr	r4, SPRN_LDBAR
+	std	r3, STOP_PID(r13)
+	std	r4, STOP_LDBAR(r13)
+
+	mfspr	r3, SPRN_FSCR
+	mfspr	r4, SPRN_HFSCR
+	std	r3, STOP_FSCR(r13)
+	std	r4, STOP_HFSCR(r13)
+
+	mfspr	r3, SPRN_MMCRA
+	mfspr	r4, SPRN_MMCR0
+	std	r3, STOP_MMCRA(r13)
+	std	r4, _MMCR0(r1)
+
+	mfspr	r3, SPRN_MMCR1
+	mfspr	r4, SPRN_MMCR2
+	std	r3, STOP_MMCR1(r13)
+	std	r4, STOP_MMCR2(r13)
+	blr
+
+power9_restore_additional_sprs:
+	ld	r3,_LPCR(r1)
+	ld	r4, STOP_PID(r13)
+	mtspr	SPRN_LPCR,r3
+	mtspr	SPRN_PID, r4
+
+	ld	r3, STOP_LDBAR(r13)
+	ld	r4, STOP_FSCR(r13)
+	mtspr	SPRN_LDBAR, r3
+	mtspr	SPRN_FSCR, r4
+
+	ld	r3, STOP_HFSCR(r13)
+	ld	r4, STOP_MMCRA(r13)
+	mtspr	SPRN_HFSCR, r3
+	mtspr	SPRN_MMCRA, r4
+
+	ld	r3, _MMCR0(r1)
+	ld	r4, STOP_MMCR1(r13)
+	mtspr	SPRN_MMCR0, r3
+	mtspr	SPRN_MMCR1, r4
+
+	ld	r3, STOP_MMCR2(r13)
+	ld	r4, PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_MMCR2, r3
+	mtspr	SPRN_SPRG3, r4
+	blr
+
+/*
+ * Used by threads when the lock bit of core_idle_state is set.
+ * Threads will spin in HMT_LOW until the lock bit is cleared.
+ * r14 - pointer to core_idle_state
+ * r15 - used to load contents of core_idle_state
+ * r9  - used as a temporary variable
+ */
+
+core_idle_lock_held:
+	HMT_LOW
+3:	lwz	r15,0(r14)
+	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bne	3b
+	HMT_MEDIUM
+	lwarx	r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bne-	core_idle_lock_held
+	blr
+
+/* Reuse some unused pt_regs slots for AMR/IAMR/UAMOR/UAMOR */
+#define PNV_POWERSAVE_AMR	_TRAP
+#define PNV_POWERSAVE_IAMR	_DAR
+#define PNV_POWERSAVE_UAMOR	_DSISR
+#define PNV_POWERSAVE_AMOR	RESULT
+
+/*
+ * Pass requested state in r3:
+ *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
+ *	   - Requested PSSCR value in POWER9
+ *
+ * Address of idle handler to branch to in realmode in r4
+ */
+pnv_powersave_common:
+	/* Use r3 to pass state nap/sleep/winkle */
+	/* NAP is a state loss, we create a regs frame on the
+	 * stack, fill it up with the state we care about and
+	 * stick a pointer to it in PACAR1. We really only
+	 * need to save PC, some CR bits and the NV GPRs,
+	 * but for now an interrupt frame will do.
+	 */
+	mtctr	r4
+
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-INT_FRAME_SIZE(r1)
+	std	r0,_LINK(r1)
+	std	r0,_NIP(r1)
+
+	/* We haven't lost state ... yet */
+	li	r0,0
+	stb	r0,PACA_NAPSTATELOST(r13)
+
+	/* Continue saving state */
+	SAVE_GPR(2, r1)
+	SAVE_NVGPRS(r1)
+
+BEGIN_FTR_SECTION
+	mfspr	r4, SPRN_AMR
+	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_UAMOR
+	std	r4, PNV_POWERSAVE_AMR(r1)
+	std	r5, PNV_POWERSAVE_IAMR(r1)
+	std	r6, PNV_POWERSAVE_UAMOR(r1)
+BEGIN_FTR_SECTION_NESTED(42)
+	mfspr	r7, SPRN_AMOR
+	std	r7, PNV_POWERSAVE_AMOR(r1)
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	mfcr	r5
+	std	r5,_CCR(r1)
+	std	r1,PACAR1(r13)
+
+BEGIN_FTR_SECTION
+	/*
+	 * POWER9 does not require real mode to stop, and presently does not
+	 * set hwthread_state for KVM (threads don't share MMU context), so
+	 * we can remain in virtual mode for this.
+	 */
+	bctr
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	/*
+	 * POWER8
+	 * Go to real mode to do the nap, as required by the architecture.
+	 * Also, we need to be in real mode before setting hwthread_state,
+	 * because as soon as we do that, another thread can switch
+	 * the MMU context to the guest.
+	 */
+	LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
+	mtmsrd	r7,0
+	bctr
+
+/*
+ * This is the sequence required to execute idle instructions, as
+ * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
+ */
+#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
+	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
+	std	r0,0(r1);					\
+	ptesync;						\
+	ld	r0,0(r1);					\
+236:	cmpd	cr0,r0,r0;					\
+	bne	236b;						\
+	IDLE_INST;
+
+
+	.globl pnv_enter_arch207_idle_mode
+pnv_enter_arch207_idle_mode:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're entering idle */
+	li	r4,KVM_HWTHREAD_IN_IDLE
+	/******************************************************/
+	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
+	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
+	/* MUST occur in real mode, i.e. with the MMU off,    */
+	/* and the MMU must stay off until we clear this flag */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
+	/* The reason is that another thread can switch the   */
+	/* MMU to a guest context whenever this flag is set   */
+	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
+	/* that would potentially cause this thread to start  */
+	/* executing instructions from guest memory in        */
+	/* hypervisor mode, leading to a host crash or data   */
+	/* corruption, or worse.                              */
+	/******************************************************/
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
+	stb	r3,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr3,r3,PNV_THREAD_SLEEP
+	bge	cr3,2f
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
+	/* No return */
+2:
+	/* Sleep or winkle */
+	lbz	r7,PACA_THREAD_MASK(r13)
+	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+	li	r5,0
+	beq	cr3,3f
+	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT@h
+3:
+lwarx_loop1:
+	lwarx	r15,0,r14
+
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
+
+	add	r15,r15,r5			/* Add if winkle */
+	andc	r15,r15,r7			/* Clear thread bit */
+
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
+
+/*
+ * If cr0 = 0, then current thread is the last thread of the core entering
+ * sleep. Last thread needs to execute the hardware bug workaround code if
+ * required by the platform.
+ * Make the workaround call unconditionally here. The below branch call is
+ * patched out when the idle states are discovered if the platform does not
+ * require it.
+ */
+.global pnv_fastsleep_workaround_at_entry
+pnv_fastsleep_workaround_at_entry:
+	beq	fastsleep_workaround_at_entry
+
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop1
+	isync
+
+common_enter: /* common code for all the threads entering sleep or winkle */
+	bgt	cr3,enter_winkle
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
+
+fastsleep_workaround_at_entry:
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	stwcx.	r15,0,r14
+	bne-	lwarx_loop1
+	isync
+
+	/* Fast sleep workaround */
+	li	r3,1
+	li	r4,1
+	bl	opal_config_cpu_idle_state
+
+	/* Unlock */
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	lwsync
+	stw	r15,0(r14)
+	b	common_enter
+
+enter_winkle:
+	bl	save_sprs_to_stack
+
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
+
+/*
+ * r3 - PSSCR value corresponding to the requested stop state.
+ */
+power_enter_stop:
+/*
+ * Check if we are executing the lite variant with ESL=EC=0
+ */
+	andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
+	clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
+	bne	 .Lhandle_esl_ec_set
+	PPC_STOP
+	li	r3,0  /* Since we didn't lose state, return 0 */
+	std	r3, PACA_REQ_PSSCR(r13)
+
+	/*
+	 * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
+	 * it can determine if the wakeup reason is an HMI in
+	 * CHECK_HMI_INTERRUPT.
+	 *
+	 * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
+	 * reason, so there is no point setting r12 to SRR1.
+	 *
+	 * Further, we clear r12 here, so that we don't accidentally enter the
+	 * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
+	 */
+	li	r12, 0
+	b 	pnv_wakeup_noloss
+
+.Lhandle_esl_ec_set:
+BEGIN_FTR_SECTION
+	/*
+	 * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after
+	 * a state-loss idle. Saving and restoring MMCR0 over idle is a
+	 * workaround.
+	 */
+	mfspr	r4,SPRN_MMCR0
+	std	r4,_MMCR0(r1)
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
+
+/*
+ * Check if the requested state is a deep idle state.
+ */
+	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
+	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
+	cmpd	r3,r4
+	bge	.Lhandle_deep_stop
+	PPC_STOP	/* Does not return (system reset interrupt) */
+
+.Lhandle_deep_stop:
+/*
+ * Entering deep idle state.
+ * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
+ * stack and enter stop
+ */
+	lbz     r7,PACA_THREAD_MASK(r13)
+	ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
+
+lwarx_loop_stop:
+	lwarx   r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
+	andc    r15,r15,r7                      /* Clear thread bit */
+
+	stwcx.  r15,0,r14
+	bne-    lwarx_loop_stop
+	isync
+
+	bl	save_sprs_to_stack
+
+	PPC_STOP	/* Does not return (system reset interrupt) */
+
+/*
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE).
+ */
+_GLOBAL(power7_idle_insn)
+	/* Now check if user or arch enabled NAP mode */
+	LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode)
+	b	pnv_powersave_common
+
+#define CHECK_HMI_INTERRUPT						\
+BEGIN_FTR_SECTION_NESTED(66);						\
+	rlwinm	r0,r12,45-31,0xf;  /* extract wake reason field (P8) */	\
+FTR_SECTION_ELSE_NESTED(66);						\
+	rlwinm	r0,r12,45-31,0xe;  /* P7 wake reason field is 3 bits */	\
+ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
+	cmpwi	r0,0xa;			/* Hypervisor maintenance ? */	\
+	bne+	20f;							\
+	/* Invoke opal call to handle hmi */				\
+	ld	r2,PACATOC(r13);					\
+	ld	r1,PACAR1(r13);						\
+	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
+	li	r3,0;			/* NULL argument */		\
+	bl	hmi_exception_realmode;					\
+	nop;								\
+	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
+20:	nop;
+
+/*
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired PSSCR register value.
+ *
+ * Offline (CPU unplug) case also must notify KVM that the CPU is
+ * idle.
+ */
+_GLOBAL(power9_offline_stop)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/*
+	 * Tell KVM we're entering idle.
+	 * This does not have to be done in real mode because the P9 MMU
+	 * is independent per-thread. Some steppings share radix/hash mode
+	 * between threads, but in that case KVM has a barrier sync in real
+	 * mode before and after switching between radix and hash.
+	 */
+	li	r4,KVM_HWTHREAD_IN_IDLE
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
+	/* fall through */
+
+_GLOBAL(power9_idle_stop)
+	std	r3, PACA_REQ_PSSCR(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+BEGIN_FTR_SECTION
+	sync
+	lwz	r5, PACA_DONT_STOP(r13)
+	cmpwi	r5, 0
+	bne	1f
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
+#endif
+	mtspr 	SPRN_PSSCR,r3
+	LOAD_REG_ADDR(r4,power_enter_stop)
+	b	pnv_powersave_common
+	/* No return */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+1:
+	/*
+	 * We get here when TM / thread reconfiguration bug workaround
+	 * code wants to get the CPU into SMT4 mode, and therefore
+	 * we are being asked not to stop.
+	 */
+	li	r3, 0
+	std	r3, PACA_REQ_PSSCR(r13)
+	blr		/* return 0 for wakeup cause / SRR1 value */
+#endif
+
+/*
+ * Called from machine check handler for powersave wakeups.
+ * Low level machine check processing has already been done. Now just
+ * go through the wake up path to get everything in order.
+ *
+ * r3 - The original SRR1 value.
+ * Original SRR[01] have been clobbered.
+ * MSR_RI is clear.
+ */
+.global pnv_powersave_wakeup_mce
+pnv_powersave_wakeup_mce:
+	/* Set cr3 for pnv_powersave_wakeup */
+	rlwinm	r11,r3,47-31,30,31
+	cmpwi	cr3,r11,2
+
+	/*
+	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
+	 * reason into r12, which allows reuse of the system reset wakeup
+	 * code without being mistaken for another type of wakeup.
+	 */
+	oris	r12,r3,SRR1_WAKEMCE_RESVD@h
+
+	b	pnv_powersave_wakeup
+
+/*
+ * Called from reset vector for powersave wakeups.
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ * r12 - SRR1
+ */
+.global pnv_powersave_wakeup
+pnv_powersave_wakeup:
+	ld	r2, PACATOC(r13)
+
+BEGIN_FTR_SECTION
+	bl	pnv_restore_hyp_resource_arch300
+FTR_SECTION_ELSE
+	bl	pnv_restore_hyp_resource_arch207
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+	li	r0,PNV_THREAD_RUNNING
+	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
+
+	mr	r3,r12
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	lbz	r0,HSTATE_HWTHREAD_STATE(r13)
+	cmpwi	r0,KVM_HWTHREAD_IN_KERNEL
+	beq	0f
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+0:	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
+	b	kvm_start_guest
+1:
+#endif
+
+	/* Return SRR1 from power7_nap() */
+	blt	cr3,pnv_wakeup_noloss
+	b	pnv_wakeup_loss
+
+/*
+ * Check whether we have woken up with hypervisor state loss.
+ * If yes, restore hypervisor state and return back to link.
+ *
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ */
+pnv_restore_hyp_resource_arch300:
+	/*
+	 * Workaround for POWER9, if we lost resources, the ERAT
+	 * might have been mixed up and needs flushing. We also need
+	 * to reload MMCR0 (see comment above). We also need to set
+	 * then clear bit 60 in MMCRA to ensure the PMU starts running.
+	 */
+	blt	cr3,1f
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+	ld	r1,PACAR1(r13)
+	ld	r4,_MMCR0(r1)
+	mtspr	SPRN_MMCR0,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
+	mfspr	r4,SPRN_MMCRA
+	ori	r4,r4,(1 << (63-60))
+	mtspr	SPRN_MMCRA,r4
+	xori	r4,r4,(1 << (63-60))
+	mtspr	SPRN_MMCRA,r4
+1:
+	/*
+	 * POWER ISA 3. Use PSSCR to determine if we
+	 * are waking up from deep idle state
+	 */
+	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
+	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
+
+	/*
+	 * 0-3 bits correspond to Power-Saving Level Status
+	 * which indicates the idle state we are waking up from
+	 */
+	mfspr	r5, SPRN_PSSCR
+	rldicl  r5,r5,4,60
+	li	r0, 0		/* clear requested_psscr to say we're awake */
+	std	r0, PACA_REQ_PSSCR(r13)
+	cmpd	cr4,r5,r4
+	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
+
+	blr	/* Waking up without hypervisor state loss. */
+
+/* Same calling convention as arch300 */
+pnv_restore_hyp_resource_arch207:
+	/*
+	 * POWER ISA 2.07 or less.
+	 * Check if we slept with sleep or winkle.
+	 */
+	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr2,r4,PNV_THREAD_NAP
+	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
+
+	/*
+	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
+	 * up from nap. At this stage CR3 shouldn't contains 'gt' since that
+	 * indicates we are waking with hypervisor state loss from nap.
+	 */
+	bgt	cr3,.
+
+	blr	/* Waking up without hypervisor state loss */
+
+/*
+ * Called if waking up from idle state which can cause either partial or
+ * complete hyp state loss.
+ * In POWER8, called if waking up from fastsleep or winkle
+ * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
+ *
+ * r13 - PACA
+ * cr3 - gt if waking up with partial/complete hypervisor state loss
+ *
+ * If ISA300:
+ * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ *
+ * If ISA207:
+ * r4 - PACA_THREAD_IDLE_STATE
+ */
+pnv_wakeup_tb_loss:
+	ld	r1,PACAR1(r13)
+	/*
+	 * Before entering any idle state, the NVGPRs are saved in the stack.
+	 * If there was a state loss, or PACA_NAPSTATELOST was set, then the
+	 * NVGPRs are restored. If we are here, it is likely that state is lost,
+	 * but not guaranteed -- neither ISA207 nor ISA300 tests to reach
+	 * here are the same as the test to restore NVGPRS:
+	 * PACA_THREAD_IDLE_STATE test for ISA207, PSSCR test for ISA300,
+	 * and SRR1 test for restoring NVGPRs.
+	 *
+	 * We are about to clobber NVGPRs now, so set NAPSTATELOST to
+	 * guarantee they will always be restored. This might be tightened
+	 * with careful reading of specs (particularly for ISA300) but this
+	 * is already a slow wakeup path and it's simpler to be safe.
+	 */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
+
+	/*
+	 *
+	 * Save SRR1 and LR in NVGPRs as they might be clobbered in
+	 * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
+	 * to determine the wakeup reason if we branch to kvm_start_guest. LR
+	 * is required to return back to reset vector after hypervisor state
+	 * restore is complete.
+	 */
+	mr	r19,r12
+	mr	r18,r4
+	mflr	r17
+BEGIN_FTR_SECTION
+	CHECK_HMI_INTERRUPT
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+
+	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+	lbz	r7,PACA_THREAD_MASK(r13)
+
+	/*
+	 * Take the core lock to synchronize against other threads.
+	 *
+	 * Lock bit is set in one of the 2 cases-
+	 * a. In the sleep/winkle enter path, the last thread is executing
+	 * fastsleep workaround code.
+	 * b. In the wake up path, another thread is executing fastsleep
+	 * workaround undo code or resyncing timebase or restoring context
+	 * In either case loop until the lock bit is cleared.
+	 */
+1:
+	lwarx	r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	stwcx.	r15,0,r14
+	bne-	1b
+	isync
+
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
+	cmpwi	cr2,r9,0
+
+	/*
+	 * At this stage
+	 * cr2 - eq if first thread to wakeup in core
+	 * cr3-  gt if waking up with partial/complete hypervisor state loss
+	 * ISA300:
+	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
+	 */
+
+BEGIN_FTR_SECTION
+	/*
+	 * Were we in winkle?
+	 * If yes, check if all threads were in winkle, decrement our
+	 * winkle count, set all thread winkle bits if all were in winkle.
+	 * Check if our thread has a winkle bit set, and set cr4 accordingly
+	 * (to match ISA300, above). Pseudo-code for core idle state
+	 * transitions for ISA207 is as follows (everything happens atomically
+	 * due to store conditional and/or lock bit):
+	 *
+	 * nap_idle() { }
+	 * nap_wake() { }
+	 *
+	 * sleep_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core
+	 * }
+	 *
+	 * sleep_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 * }
+	 *
+	 * winkle_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core;
+	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
+	 * }
+	 *
+	 * winkle_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore, winkle_state_lost;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 *
+	 *     if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
+	 *         core_idle_state |= THREAD_WINKLE_BITS;
+	 *     core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
+	 *
+	 *     winkle_state_lost = core_idle_state &
+	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
+	 *     core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
+	 * }
+	 *
+	 */
+	cmpwi	r18,PNV_THREAD_WINKLE
+	bne	2f
+	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
+	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
+	beq	2f
+	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
+2:
+	/* Shift thread bit to winkle mask, then test if this thread is set,
+	 * and remove it from the winkle bits */
+	slwi	r8,r7,8
+	and	r8,r8,r15
+	andc	r15,r15,r8
+	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
+
+	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
+	and	r4,r4,r15
+	cmpwi	r4,0	/* Check if first in subcore */
+
+	or	r15,r15,r7		/* Set thread bit */
+	beq	first_thread_in_subcore
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
+	or	r15,r15,r7		/* Set thread bit */
+	beq	cr2,first_thread_in_core
+
+	/* Not first thread in core or subcore to wake up */
+	b	clear_lock
+
+first_thread_in_subcore:
+	/*
+	 * If waking up from sleep, subcore state is not lost. Hence
+	 * skip subcore state restore
+	 */
+	blt	cr4,subcore_state_restored
+
+	/* Restore per-subcore state */
+	ld      r4,_SDR1(r1)
+	mtspr   SPRN_SDR1,r4
+
+	ld      r4,_RPR(r1)
+	mtspr   SPRN_RPR,r4
+	ld	r4,_AMOR(r1)
+	mtspr	SPRN_AMOR,r4
+
+subcore_state_restored:
+	/*
+	 * Check if the thread is also the first thread in the core. If not,
+	 * skip to clear_lock.
+	 */
+	bne	cr2,clear_lock
+
+first_thread_in_core:
+
+	/*
+	 * First thread in the core waking up from any state which can cause
+	 * partial or complete hypervisor state loss. It needs to
+	 * call the fastsleep workaround code if the platform requires it.
+	 * Call it unconditionally here. The below branch instruction will
+	 * be patched out if the platform does not have fastsleep or does not
+	 * require the workaround. Patching will be performed during the
+	 * discovery of idle-states.
+	 */
+.global pnv_fastsleep_workaround_at_exit
+pnv_fastsleep_workaround_at_exit:
+	b	fastsleep_workaround_at_exit
+
+timebase_resync:
+	/*
+	 * Use cr3 which indicates that we are waking up with atleast partial
+	 * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
+	 */
+	ble	cr3,.Ltb_resynced
+	/* Time base re-sync */
+	bl	opal_resync_timebase;
+	/*
+	 * If waking up from sleep (POWER8), per core state
+	 * is not lost, skip to clear_lock.
+	 */
+.Ltb_resynced:
+	blt	cr4,clear_lock
+
+	/*
+	 * First thread in the core to wake up and its waking up with
+	 * complete hypervisor state loss. Restore per core hypervisor
+	 * state.
+	 */
+BEGIN_FTR_SECTION
+	ld	r4,_PTCR(r1)
+	mtspr	SPRN_PTCR,r4
+	ld	r4,_RPR(r1)
+	mtspr	SPRN_RPR,r4
+	ld	r4,_AMOR(r1)
+	mtspr	SPRN_AMOR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
+	ld	r4,_TSCR(r1)
+	mtspr	SPRN_TSCR,r4
+	ld	r4,_WORC(r1)
+	mtspr	SPRN_WORC,r4
+
+clear_lock:
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	lwsync
+	stw	r15,0(r14)
+
+common_exit:
+	/*
+	 * Common to all threads.
+	 *
+	 * If waking up from sleep, hypervisor state is not lost. Hence
+	 * skip hypervisor state restore.
+	 */
+	blt	cr4,hypervisor_state_restored
+
+	/* Waking up from winkle */
+
+BEGIN_MMU_FTR_SECTION
+	b	no_segments
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+	/* Restore SLB  from PACA */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	li	r3, SLBSHADOW_SAVEAREA
+	LDX_BE	r5, r8, r3
+	addi	r3, r3, 8
+	LDX_BE	r6, r8, r3
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+no_segments:
+
+	/* Restore per thread state */
+
+	ld	r4,_SPURR(r1)
+	mtspr	SPRN_SPURR,r4
+	ld	r4,_PURR(r1)
+	mtspr	SPRN_PURR,r4
+	ld	r4,_DSCR(r1)
+	mtspr	SPRN_DSCR,r4
+	ld	r4,_WORT(r1)
+	mtspr	SPRN_WORT,r4
+
+	/* Call cur_cpu_spec->cpu_restore() */
+	LOAD_REG_ADDR(r4, cur_cpu_spec)
+	ld	r4,0(r4)
+	ld	r12,CPU_SPEC_RESTORE(r4)
+#ifdef PPC64_ELF_ABI_v1
+	ld	r12,0(r12)
+#endif
+	mtctr	r12
+	bctrl
+
+/*
+ * On POWER9, we can come here on wakeup from a cpuidle stop state.
+ * Hence restore the additional SPRs to the saved value.
+ *
+ * On POWER8, we come here only on winkle. Since winkle is used
+ * only in the case of CPU-Hotplug, we don't need to restore
+ * the additional SPRs.
+ */
+BEGIN_FTR_SECTION
+	bl 	power9_restore_additional_sprs
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+hypervisor_state_restored:
+
+	mr	r12,r19
+	mtlr	r17
+	blr		/* return to pnv_powersave_wakeup */
+
+fastsleep_workaround_at_exit:
+	li	r3,1
+	li	r4,0
+	bl	opal_config_cpu_idle_state
+	b	timebase_resync
+
+/*
+ * R3 here contains the value that will be returned to the caller
+ * of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
+ */
+.global pnv_wakeup_loss
+pnv_wakeup_loss:
+	ld	r1,PACAR1(r13)
+BEGIN_FTR_SECTION
+	CHECK_HMI_INTERRUPT
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+	REST_NVGPRS(r1)
+	REST_GPR(2, r1)
+
+BEGIN_FTR_SECTION
+	/* These regs were saved in pnv_powersave_common() */
+	ld	r4, PNV_POWERSAVE_AMR(r1)
+	ld	r5, PNV_POWERSAVE_IAMR(r1)
+	ld	r6, PNV_POWERSAVE_UAMOR(r1)
+	mtspr	SPRN_AMR, r4
+	mtspr	SPRN_IAMR, r5
+	mtspr	SPRN_UAMOR, r6
+BEGIN_FTR_SECTION_NESTED(42)
+	ld	r7, PNV_POWERSAVE_AMOR(r1)
+	mtspr	SPRN_AMOR, r7
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
+	/*
+	 * We don't need an isync here after restoring IAMR because the upcoming
+	 * mtmsrd is execution synchronizing.
+	 */
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	ld	r4,PACAKMSR(r13)
+	ld	r5,_LINK(r1)
+	ld	r6,_CCR(r1)
+	addi	r1,r1,INT_FRAME_SIZE
+	mtlr	r5
+	mtcr	r6
+	mtmsrd	r4
+	blr
+
+/*
+ * R3 here contains the value that will be returned to the caller
+ * of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
+ */
+pnv_wakeup_noloss:
+	lbz	r0,PACA_NAPSTATELOST(r13)
+	cmpwi	r0,0
+	bne	pnv_wakeup_loss
+	ld	r1,PACAR1(r13)
+BEGIN_FTR_SECTION
+	CHECK_HMI_INTERRUPT
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+	ld	r4,PACAKMSR(r13)
+	ld	r5,_NIP(r1)
+	ld	r6,_CCR(r1)
+	addi	r1,r1,INT_FRAME_SIZE
+	mtlr	r5
+	mtcr	r6
+	mtmsrd	r4
+	blr
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
new file mode 100644
index 000000000..583e55ac7
--- /dev/null
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Dave Liu <daveliu@freescale.com>
+ * copy from idle_6xx.S and modify for e500 based processor,
+ * implement the power_save function in idle.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/feature-fixups.h>
+
+	.text
+
+_GLOBAL(e500_idle)
+	CURRENT_THREAD_INFO(r3, r1)
+	lwz	r4,TI_LOCAL_FLAGS(r3)	/* set napping bit */
+	ori	r4,r4,_TLF_NAPPING	/* so when we take an exception */
+	stw	r4,TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
+
+#ifdef CONFIG_PPC_E500MC
+	wrteei	1
+1:	wait
+
+	/*
+	 * Guard against spurious wakeups (e.g. from a hypervisor) --
+	 * any real interrupt will cause us to return to LR due to
+	 * _TLF_NAPPING.
+	 */
+	b	1b
+#else
+	/* Check if we can nap or doze, put HID0 mask in r3 */
+	lis	r3,0
+BEGIN_FTR_SECTION
+	lis	r3,HID0_DOZE@h
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE)
+
+BEGIN_FTR_SECTION
+	/* Now check if user enabled NAP mode */
+	lis	r4,powersave_nap@ha
+	lwz	r4,powersave_nap@l(r4)
+	cmpwi	0,r4,0
+	beq	1f
+	stwu	r1,-16(r1)
+	mflr	r0
+	stw	r0,20(r1)
+	bl	flush_dcache_L1
+	lwz	r0,20(r1)
+	addi	r1,r1,16
+	mtlr	r0
+	lis	r3,HID0_NAP@h
+END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
+1:
+	/* Go to NAP or DOZE now */
+	mfspr	r4,SPRN_HID0
+	rlwinm	r4,r4,0,~(HID0_DOZE|HID0_NAP|HID0_SLEEP)
+	or	r4,r4,r3
+	isync
+	mtspr	SPRN_HID0,r4
+	isync
+
+	mfmsr	r7
+	oris	r7,r7,MSR_WE@h
+	ori	r7,r7,MSR_EE
+	msync
+	mtmsr	r7
+	isync
+2:	b	2b
+#endif /* !E500MC */
+
+/*
+ * Return from NAP/DOZE mode, restore some CPU specific registers,
+ * r2 containing physical address of current.
+ * r11 points to the exception frame (physical address).
+ * We have to preserve r10.
+ */
+_GLOBAL(power_save_ppc32_restore)
+	lwz	r9,_LINK(r11)		/* interrupted in e500_idle */
+	stw	r9,_NIP(r11)		/* make it do a blr */
+
+#ifdef CONFIG_SMP
+	CURRENT_THREAD_INFO(r12, r1)
+	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
+	slwi	r11,r11,2
+#else
+	li	r11,0
+#endif
+
+	b	transfer_to_handler_cont
diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
new file mode 100644
index 000000000..a09b3c7ca
--- /dev/null
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -0,0 +1,87 @@
+/*
+ *  This file contains the power_save function for 970-family CPUs.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/irqflags.h>
+#include <asm/hw_irq.h>
+#include <asm/feature-fixups.h>
+
+#undef DEBUG
+
+	.text
+
+_GLOBAL(power4_idle)
+BEGIN_FTR_SECTION
+	blr
+END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
+	/* Now check if user or arch enabled NAP mode */
+	LOAD_REG_ADDRBASE(r3,powersave_nap)
+	lwz	r4,ADDROFF(powersave_nap)(r3)
+	cmpwi	0,r4,0
+	beqlr
+
+	/* This sequence is similar to prep_irq_for_idle() */
+
+	/* Hard disable interrupts */
+	mfmsr	r7
+	rldicl	r0,r7,48,1
+	rotldi	r0,r0,16
+	mtmsrd	r0,1
+
+	/* Check if something happened while soft-disabled */
+	lbz	r0,PACAIRQHAPPENED(r13)
+	cmpwi	cr0,r0,0
+	bne-	2f
+
+	/*
+	 * Soft-enable interrupts. This will make power4_fixup_nap return
+	 * to our caller with interrupts enabled (soft and hard). The caller
+	 * can cope with either interrupts disabled or enabled upon return.
+	 */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* Tell the tracer interrupts are on, because idle responds to them. */
+	mflr	r0
+	std	r0,16(r1)
+	stdu    r1,-128(r1)
+	bl	trace_hardirqs_on
+	addi    r1,r1,128
+	ld	r0,16(r1)
+	mtlr	r0
+	mfmsr	r7
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+	li	r0,IRQS_ENABLED
+	stb	r0,PACAIRQSOFTMASK(r13)	/* we'll hard-enable shortly */
+BEGIN_FTR_SECTION
+	DSSALL
+	sync
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	CURRENT_THREAD_INFO(r9, r1)
+	ld	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
+	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
+	std	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
+	ori	r7,r7,MSR_EE
+	oris	r7,r7,MSR_POW@h
+1:	sync
+	isync
+	mtmsrd	r7
+	isync
+	b	1b
+
+2:	/* Return if an interrupt had happened while soft disabled */
+	/* Set the HARD_DIS flag because interrupts are now hard disabled */
+	ori	r0,r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+	blr
diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c
new file mode 100644
index 000000000..5ea42c937
--- /dev/null
+++ b/arch/powerpc/kernel/ima_kexec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+	struct device_node *root;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return -EINVAL;
+
+	*addr_cells = of_n_addr_cells(root);
+	*size_cells = of_n_size_cells(root);
+
+	of_node_put(root);
+
+	return 0;
+}
+
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+			       size_t *size)
+{
+	int ret, addr_cells, size_cells;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	if (len < 4 * (addr_cells + size_cells))
+		return -ENOENT;
+
+	*addr = of_read_number(prop, addr_cells);
+	*size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+	return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:	On successful return, set to point to the buffer contents.
+ * @size:	On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+	int ret, len;
+	unsigned long tmp_addr;
+	size_t tmp_size;
+	const void *prop;
+
+	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+	if (ret)
+		return ret;
+
+	*addr = __va(tmp_addr);
+	*size = tmp_size;
+
+	return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+	int ret;
+	unsigned long addr;
+	size_t size;
+	struct property *prop;
+
+	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+	if (ret)
+		return ret;
+
+	ret = of_remove_property(of_chosen, prop);
+	if (ret)
+		return ret;
+
+	return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+void remove_ima_buffer(void *fdt, int chosen_node)
+{
+	int ret, len;
+	unsigned long addr;
+	size_t size;
+	const void *prop;
+
+	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return;
+
+	ret = do_get_kexec_buffer(prop, len, &addr, &size);
+	fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+	if (ret)
+		return;
+
+	ret = delete_fdt_mem_rsv(fdt, addr, size);
+	if (!ret)
+		pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
+ *
+ * Architectures should use this function to pass on the IMA buffer
+ * information to the next kernel.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+			      size_t size)
+{
+	image->arch.ima_buffer_addr = load_addr;
+	image->arch.ima_buffer_size = size;
+
+	return 0;
+}
+
+static int write_number(void *p, u64 value, int cells)
+{
+	if (cells == 1) {
+		u32 tmp;
+
+		if (value > U32_MAX)
+			return -EINVAL;
+
+		tmp = cpu_to_be32(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else if (cells == 2) {
+		u64 tmp;
+
+		tmp = cpu_to_be64(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image:		kexec image being loaded.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @chosen_node:	Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
+{
+	int ret, addr_cells, size_cells, entry_size;
+	u8 value[16];
+
+	remove_ima_buffer(fdt, chosen_node);
+	if (!image->arch.ima_buffer_size)
+		return 0;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	entry_size = 4 * (addr_cells + size_cells);
+
+	if (entry_size > sizeof(value))
+		return -EINVAL;
+
+	ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+	if (ret)
+		return ret;
+
+	ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+			   size_cells);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+			  entry_size);
+	if (ret < 0)
+		return -EINVAL;
+
+	ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
+			      image->arch.ima_buffer_size);
+	if (ret)
+		return -EINVAL;
+
+	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+		 image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
new file mode 100644
index 000000000..aa9f1b826
--- /dev/null
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -0,0 +1,213 @@
+/*
+ * Support PCI IO workaround
+ *
+ *  Copyright (C) 2006 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *		       IBM, Corp.
+ *  (C) Copyright 2007-2008 TOSHIBA CORPORATION
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/sched/mm.h>	/* for init_mm */
+
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/ppc-pci.h>
+#include <asm/io-workarounds.h>
+#include <asm/pte-walk.h>
+
+
+#define IOWA_MAX_BUS	8
+
+static struct iowa_bus iowa_busses[IOWA_MAX_BUS];
+static unsigned int iowa_bus_count;
+
+static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
+{
+	int i, j;
+	struct resource *res;
+	unsigned long vstart, vend;
+
+	for (i = 0; i < iowa_bus_count; i++) {
+		struct iowa_bus *bus = &iowa_busses[i];
+		struct pci_controller *phb = bus->phb;
+
+		if (vaddr) {
+			vstart = (unsigned long)phb->io_base_virt;
+			vend = vstart + phb->pci_io_size - 1;
+			if ((vaddr >= vstart) && (vaddr <= vend))
+				return bus;
+		}
+
+		if (paddr)
+			for (j = 0; j < 3; j++) {
+				res = &phb->mem_resources[j];
+				if (paddr >= res->start && paddr <= res->end)
+					return bus;
+			}
+	}
+
+	return NULL;
+}
+
+#ifdef CONFIG_PPC_INDIRECT_MMIO
+struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
+{
+	unsigned hugepage_shift;
+	struct iowa_bus *bus;
+	int token;
+
+	token = PCI_GET_ADDR_TOKEN(addr);
+
+	if (token && token <= iowa_bus_count)
+		bus = &iowa_busses[token - 1];
+	else {
+		unsigned long vaddr, paddr;
+		pte_t *ptep;
+
+		vaddr = (unsigned long)PCI_FIX_ADDR(addr);
+		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
+			return NULL;
+		/*
+		 * We won't find huge pages here (iomem). Also can't hit
+		 * a page table free due to init_mm
+		 */
+		ptep = find_init_mm_pte(vaddr, &hugepage_shift);
+		if (ptep == NULL)
+			paddr = 0;
+		else {
+			WARN_ON(hugepage_shift);
+			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+		}
+		bus = iowa_pci_find(vaddr, paddr);
+
+		if (bus == NULL)
+			return NULL;
+	}
+
+	return bus;
+}
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
+{
+	return NULL;
+}
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
+
+#ifdef CONFIG_PPC_INDIRECT_PIO
+struct iowa_bus *iowa_pio_find_bus(unsigned long port)
+{
+	unsigned long vaddr = (unsigned long)pci_io_base + port;
+	return iowa_pci_find(vaddr, 0);
+}
+#else
+struct iowa_bus *iowa_pio_find_bus(unsigned long port)
+{
+	return NULL;
+}
+#endif
+
+#define DEF_PCI_AC_RET(name, ret, at, al, space, aa)		\
+static ret iowa_##name at					\
+{								\
+	struct iowa_bus *bus;					\
+	bus = iowa_##space##_find_bus(aa);			\
+	if (bus && bus->ops && bus->ops->name)			\
+		return bus->ops->name al;			\
+	return __do_##name al;					\
+}
+
+#define DEF_PCI_AC_NORET(name, at, al, space, aa)		\
+static void iowa_##name at					\
+{								\
+	struct iowa_bus *bus;					\
+	bus = iowa_##space##_find_bus(aa);			\
+	if (bus && bus->ops && bus->ops->name) {		\
+		bus->ops->name al;				\
+		return;						\
+	}							\
+	__do_##name al;						\
+}
+
+#include <asm/io-defs.h>
+
+#undef DEF_PCI_AC_RET
+#undef DEF_PCI_AC_NORET
+
+static const struct ppc_pci_io iowa_pci_io = {
+
+#define DEF_PCI_AC_RET(name, ret, at, al, space, aa)	.name = iowa_##name,
+#define DEF_PCI_AC_NORET(name, at, al, space, aa)	.name = iowa_##name,
+
+#include <asm/io-defs.h>
+
+#undef DEF_PCI_AC_RET
+#undef DEF_PCI_AC_NORET
+
+};
+
+#ifdef CONFIG_PPC_INDIRECT_MMIO
+static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+				  unsigned long flags, void *caller)
+{
+	struct iowa_bus *bus;
+	void __iomem *res = __ioremap_caller(addr, size, flags, caller);
+	int busno;
+
+	bus = iowa_pci_find(0, (unsigned long)addr);
+	if (bus != NULL) {
+		busno = bus - iowa_busses;
+		PCI_SET_ADDR_TOKEN(res, busno + 1);
+	}
+	return res;
+}
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+#define iowa_ioremap NULL
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
+
+/* Enable IO workaround */
+static void io_workaround_init(void)
+{
+	static int io_workaround_inited;
+
+	if (io_workaround_inited)
+		return;
+	ppc_pci_io = iowa_pci_io;
+	ppc_md.ioremap = iowa_ioremap;
+	io_workaround_inited = 1;
+}
+
+/* Register new bus to support workaround */
+void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops,
+		       int (*initfunc)(struct iowa_bus *, void *), void *data)
+{
+	struct iowa_bus *bus;
+	struct device_node *np = phb->dn;
+
+	io_workaround_init();
+
+	if (iowa_bus_count >= IOWA_MAX_BUS) {
+		pr_err("IOWA:Too many pci bridges, "
+		       "workarounds disabled for %pOF\n", np);
+		return;
+	}
+
+	bus = &iowa_busses[iowa_bus_count];
+	bus->phb = phb;
+	bus->ops = ops;
+	bus->private = data;
+
+	if (initfunc)
+		if ((*initfunc)(bus, data))
+			return;
+
+	iowa_bus_count++;
+
+	pr_debug("IOWA:[%d]Add bus, %pOF.\n", iowa_bus_count-1, np);
+}
+
diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c
new file mode 100644
index 000000000..2a2b4aeab
--- /dev/null
+++ b/arch/powerpc/kernel/io.c
@@ -0,0 +1,210 @@
+/*
+ * I/O string operations
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *    Copyright (C) 2006 IBM Corporation
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
+ *
+ * Rewritten in C by Stephen Rothwell.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+
+#include <asm/io.h>
+#include <asm/firmware.h>
+#include <asm/bug.h>
+
+/* See definition in io.h */
+bool isa_io_special;
+
+void _insb(const volatile u8 __iomem *port, void *buf, long count)
+{
+	u8 *tbuf = buf;
+	u8 tmp;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		tmp = *port;
+		eieio();
+		*tbuf++ = tmp;
+	} while (--count != 0);
+	asm volatile("twi 0,%0,0; isync" : : "r" (tmp));
+}
+EXPORT_SYMBOL(_insb);
+
+void _outsb(volatile u8 __iomem *port, const void *buf, long count)
+{
+	const u8 *tbuf = buf;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		*port = *tbuf++;
+	} while (--count != 0);
+	asm volatile("sync");
+}
+EXPORT_SYMBOL(_outsb);
+
+void _insw_ns(const volatile u16 __iomem *port, void *buf, long count)
+{
+	u16 *tbuf = buf;
+	u16 tmp;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		tmp = *port;
+		eieio();
+		*tbuf++ = tmp;
+	} while (--count != 0);
+	asm volatile("twi 0,%0,0; isync" : : "r" (tmp));
+}
+EXPORT_SYMBOL(_insw_ns);
+
+void _outsw_ns(volatile u16 __iomem *port, const void *buf, long count)
+{
+	const u16 *tbuf = buf;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		*port = *tbuf++;
+	} while (--count != 0);
+	asm volatile("sync");
+}
+EXPORT_SYMBOL(_outsw_ns);
+
+void _insl_ns(const volatile u32 __iomem *port, void *buf, long count)
+{
+	u32 *tbuf = buf;
+	u32 tmp;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		tmp = *port;
+		eieio();
+		*tbuf++ = tmp;
+	} while (--count != 0);
+	asm volatile("twi 0,%0,0; isync" : : "r" (tmp));
+}
+EXPORT_SYMBOL(_insl_ns);
+
+void _outsl_ns(volatile u32 __iomem *port, const void *buf, long count)
+{
+	const u32 *tbuf = buf;
+
+	if (unlikely(count <= 0))
+		return;
+	asm volatile("sync");
+	do {
+		*port = *tbuf++;
+	} while (--count != 0);
+	asm volatile("sync");
+}
+EXPORT_SYMBOL(_outsl_ns);
+
+#define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
+
+notrace void
+_memset_io(volatile void __iomem *addr, int c, unsigned long n)
+{
+	void *p = (void __force *)addr;
+	u32 lc = c;
+	lc |= lc << 8;
+	lc |= lc << 16;
+
+	__asm__ __volatile__ ("sync" : : : "memory");
+	while(n && !IO_CHECK_ALIGN(p, 4)) {
+		*((volatile u8 *)p) = c;
+		p++;
+		n--;
+	}
+	while(n >= 4) {
+		*((volatile u32 *)p) = lc;
+		p += 4;
+		n -= 4;
+	}
+	while(n) {
+		*((volatile u8 *)p) = c;
+		p++;
+		n--;
+	}
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+EXPORT_SYMBOL(_memset_io);
+
+void _memcpy_fromio(void *dest, const volatile void __iomem *src,
+		    unsigned long n)
+{
+	void *vsrc = (void __force *) src;
+
+	__asm__ __volatile__ ("sync" : : : "memory");
+	while(n && (!IO_CHECK_ALIGN(vsrc, 4) || !IO_CHECK_ALIGN(dest, 4))) {
+		*((u8 *)dest) = *((volatile u8 *)vsrc);
+		eieio();
+		vsrc++;
+		dest++;
+		n--;
+	}
+	while(n >= 4) {
+		*((u32 *)dest) = *((volatile u32 *)vsrc);
+		eieio();
+		vsrc += 4;
+		dest += 4;
+		n -= 4;
+	}
+	while(n) {
+		*((u8 *)dest) = *((volatile u8 *)vsrc);
+		eieio();
+		vsrc++;
+		dest++;
+		n--;
+	}
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+EXPORT_SYMBOL(_memcpy_fromio);
+
+void _memcpy_toio(volatile void __iomem *dest, const void *src, unsigned long n)
+{
+	void *vdest = (void __force *) dest;
+
+	__asm__ __volatile__ ("sync" : : : "memory");
+	while(n && (!IO_CHECK_ALIGN(vdest, 4) || !IO_CHECK_ALIGN(src, 4))) {
+		*((volatile u8 *)vdest) = *((u8 *)src);
+		src++;
+		vdest++;
+		n--;
+	}
+	while(n >= 4) {
+		*((volatile u32 *)vdest) = *((volatile u32 *)src);
+		src += 4;
+		vdest += 4;
+		n-=4;
+	}
+	while(n) {
+		*((volatile u8 *)vdest) = *((u8 *)src);
+		src++;
+		vdest++;
+		n--;
+	}
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+EXPORT_SYMBOL(_memcpy_toio);
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
new file mode 100644
index 000000000..5ac84efc6
--- /dev/null
+++ b/arch/powerpc/kernel/iomap.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ppc64 "iomap" interface implementation.
+ *
+ * (C) Copyright 2004 Linus Torvalds
+ */
+#include <linux/pci.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <asm/io.h>
+#include <asm/pci-bridge.h>
+#include <asm/isa-bridge.h>
+
+/*
+ * Here comes the ppc64 implementation of the IOMAP 
+ * interfaces.
+ */
+unsigned int ioread8(void __iomem *addr)
+{
+	return readb(addr);
+}
+unsigned int ioread16(void __iomem *addr)
+{
+	return readw(addr);
+}
+unsigned int ioread16be(void __iomem *addr)
+{
+	return readw_be(addr);
+}
+unsigned int ioread32(void __iomem *addr)
+{
+	return readl(addr);
+}
+unsigned int ioread32be(void __iomem *addr)
+{
+	return readl_be(addr);
+}
+EXPORT_SYMBOL(ioread8);
+EXPORT_SYMBOL(ioread16);
+EXPORT_SYMBOL(ioread16be);
+EXPORT_SYMBOL(ioread32);
+EXPORT_SYMBOL(ioread32be);
+#ifdef __powerpc64__
+u64 ioread64(void __iomem *addr)
+{
+	return readq(addr);
+}
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+	return readq(addr);
+}
+u64 ioread64_hi_lo(void __iomem *addr)
+{
+	return readq(addr);
+}
+u64 ioread64be(void __iomem *addr)
+{
+	return readq_be(addr);
+}
+u64 ioread64be_lo_hi(void __iomem *addr)
+{
+	return readq_be(addr);
+}
+u64 ioread64be_hi_lo(void __iomem *addr)
+{
+	return readq_be(addr);
+}
+EXPORT_SYMBOL(ioread64);
+EXPORT_SYMBOL(ioread64_lo_hi);
+EXPORT_SYMBOL(ioread64_hi_lo);
+EXPORT_SYMBOL(ioread64be);
+EXPORT_SYMBOL(ioread64be_lo_hi);
+EXPORT_SYMBOL(ioread64be_hi_lo);
+#endif /* __powerpc64__ */
+
+void iowrite8(u8 val, void __iomem *addr)
+{
+	writeb(val, addr);
+}
+void iowrite16(u16 val, void __iomem *addr)
+{
+	writew(val, addr);
+}
+void iowrite16be(u16 val, void __iomem *addr)
+{
+	writew_be(val, addr);
+}
+void iowrite32(u32 val, void __iomem *addr)
+{
+	writel(val, addr);
+}
+void iowrite32be(u32 val, void __iomem *addr)
+{
+	writel_be(val, addr);
+}
+EXPORT_SYMBOL(iowrite8);
+EXPORT_SYMBOL(iowrite16);
+EXPORT_SYMBOL(iowrite16be);
+EXPORT_SYMBOL(iowrite32);
+EXPORT_SYMBOL(iowrite32be);
+#ifdef __powerpc64__
+void iowrite64(u64 val, void __iomem *addr)
+{
+	writeq(val, addr);
+}
+void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+	writeq(val, addr);
+}
+void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+	writeq(val, addr);
+}
+void iowrite64be(u64 val, void __iomem *addr)
+{
+	writeq_be(val, addr);
+}
+void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+	writeq_be(val, addr);
+}
+void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+	writeq_be(val, addr);
+}
+EXPORT_SYMBOL(iowrite64);
+EXPORT_SYMBOL(iowrite64_lo_hi);
+EXPORT_SYMBOL(iowrite64_hi_lo);
+EXPORT_SYMBOL(iowrite64be);
+EXPORT_SYMBOL(iowrite64be_lo_hi);
+EXPORT_SYMBOL(iowrite64be_hi_lo);
+#endif /* __powerpc64__ */
+
+/*
+ * These are the "repeat read/write" functions. Note the
+ * non-CPU byte order. We do things in "IO byteorder"
+ * here.
+ *
+ * FIXME! We could make these do EEH handling if we really
+ * wanted. Not clear if we do.
+ */
+void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	readsb(addr, dst, count);
+}
+void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	readsw(addr, dst, count);
+}
+void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
+{
+	readsl(addr, dst, count);
+}
+EXPORT_SYMBOL(ioread8_rep);
+EXPORT_SYMBOL(ioread16_rep);
+EXPORT_SYMBOL(ioread32_rep);
+
+void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	writesb(addr, src, count);
+}
+void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	writesw(addr, src, count);
+}
+void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
+{
+	writesl(addr, src, count);
+}
+EXPORT_SYMBOL(iowrite8_rep);
+EXPORT_SYMBOL(iowrite16_rep);
+EXPORT_SYMBOL(iowrite32_rep);
+
+void __iomem *ioport_map(unsigned long port, unsigned int len)
+{
+	return (void __iomem *) (port + _IO_BASE);
+}
+
+void ioport_unmap(void __iomem *addr)
+{
+	/* Nothing to do */
+}
+EXPORT_SYMBOL(ioport_map);
+EXPORT_SYMBOL(ioport_unmap);
+
+#ifdef CONFIG_PCI
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	if (isa_vaddr_is_ioport(addr))
+		return;
+	if (pcibios_vaddr_is_ioport(addr))
+		return;
+	iounmap(addr);
+}
+
+EXPORT_SYMBOL(pci_iounmap);
+#endif /* CONFIG_PCI */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
new file mode 100644
index 000000000..c3d2d5cd7
--- /dev/null
+++ b/arch/powerpc/kernel/iommu.c
@@ -0,0 +1,1168 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ * 
+ * Rewrite, cleanup, new allocation schemes, virtual merging: 
+ * Copyright (C) 2004 Olof Johansson, IBM Corporation
+ *               and  Ben. Herrenschmidt, IBM Corporation
+ *
+ * Dynamic DMA mapping support, bus-independent parts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/bitmap.h>
+#include <linux/iommu-helper.h>
+#include <linux/crash_dump.h>
+#include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/kdump.h>
+#include <asm/fadump.h>
+#include <asm/vio.h>
+#include <asm/tce.h>
+
+#define DBG(...)
+
+static int novmerge;
+
+static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
+
+static int __init setup_iommu(char *str)
+{
+	if (!strcmp(str, "novmerge"))
+		novmerge = 1;
+	else if (!strcmp(str, "vmerge"))
+		novmerge = 0;
+	return 1;
+}
+
+__setup("iommu=", setup_iommu);
+
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+	return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+						       NULL, &fail_iommu);
+
+	return PTR_ERR_OR_ZERO(dir);
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	int i;
+
+	if (count > 0 && sscanf(buf, "%d", &i) > 0)
+		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(fail_iommu);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		if (device_create_file(dev, &dev_attr_fail_iommu))
+			pr_warn("Unable to create IOMMU fault injection sysfs "
+				"entries\n");
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		device_remove_file(dev, &dev_attr_fail_iommu);
+	}
+
+	return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+	.notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+	return 0;
+}
+/*
+ * Must execute after PCI and VIO subsystem have initialised but before
+ * devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+	return false;
+}
+#endif
+
+static unsigned long iommu_range_alloc(struct device *dev,
+				       struct iommu_table *tbl,
+                                       unsigned long npages,
+                                       unsigned long *handle,
+                                       unsigned long mask,
+                                       unsigned int align_order)
+{ 
+	unsigned long n, end, start;
+	unsigned long limit;
+	int largealloc = npages > 15;
+	int pass = 0;
+	unsigned long align_mask;
+	unsigned long boundary_size;
+	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
+
+	align_mask = (1ull << align_order) - 1;
+
+	/* This allocator was derived from x86_64's bit string search */
+
+	/* Sanity check */
+	if (unlikely(npages == 0)) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+		return IOMMU_MAPPING_ERROR;
+	}
+
+	if (should_fail_iommu(dev))
+		return IOMMU_MAPPING_ERROR;
+
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+	if (largealloc)
+		pool = &(tbl->large_pool);
+	else
+		pool = &(tbl->pools[pool_nr]);
+
+	spin_lock_irqsave(&(pool->lock), flags);
+
+again:
+	if ((pass == 0) && handle && *handle &&
+	    (*handle >= pool->start) && (*handle < pool->end))
+		start = *handle;
+	else
+		start = pool->hint;
+
+	limit = pool->end;
+
+	/* The case below can happen if we have a small segment appended
+	 * to a large, or when the previous alloc was at the very end of
+	 * the available space. If so, go back to the initial start.
+	 */
+	if (start >= limit)
+		start = pool->start;
+
+	if (limit + tbl->it_offset > mask) {
+		limit = mask - tbl->it_offset + 1;
+		/* If we're constrained on address range, first try
+		 * at the masked hint to avoid O(n) search complexity,
+		 * but on second pass, start at 0 in pool 0.
+		 */
+		if ((start & mask) >= limit || pass > 0) {
+			spin_unlock(&(pool->lock));
+			pool = &(tbl->pools[0]);
+			spin_lock(&(pool->lock));
+			start = pool->start;
+		} else {
+			start &= mask;
+		}
+	}
+
+	if (dev)
+		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+				      1 << tbl->it_page_shift);
+	else
+		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
+	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
+
+	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
+			     boundary_size >> tbl->it_page_shift, align_mask);
+	if (n == -1) {
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
+		} else {
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
+			return IOMMU_MAPPING_ERROR;
+		}
+	}
+
+	end = n + npages;
+
+	/* Bump the hint to a new block for small allocs. */
+	if (largealloc) {
+		/* Don't bump to new block to avoid fragmentation */
+		pool->hint = end;
+	} else {
+		/* Overflow will be taken care of at the next allocation */
+		pool->hint = (end + tbl->it_blocksize - 1) &
+		                ~(tbl->it_blocksize - 1);
+	}
+
+	/* Update handle for SG allocations */
+	if (handle)
+		*handle = end;
+
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
+	return n;
+}
+
+static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+			      void *page, unsigned int npages,
+			      enum dma_data_direction direction,
+			      unsigned long mask, unsigned int align_order,
+			      unsigned long attrs)
+{
+	unsigned long entry;
+	dma_addr_t ret = IOMMU_MAPPING_ERROR;
+	int build_fail;
+
+	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
+
+	if (unlikely(entry == IOMMU_MAPPING_ERROR))
+		return IOMMU_MAPPING_ERROR;
+
+	entry += tbl->it_offset;	/* Offset into real TCE table */
+	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
+
+	/* Put the TCEs in the HW table */
+	build_fail = tbl->it_ops->set(tbl, entry, npages,
+				      (unsigned long)page &
+				      IOMMU_PAGE_MASK(tbl), direction, attrs);
+
+	/* tbl->it_ops->set() only returns non-zero for transient errors.
+	 * Clean up the table bitmap in this case and return
+	 * IOMMU_MAPPING_ERROR. For all other errors the functionality is
+	 * not altered.
+	 */
+	if (unlikely(build_fail)) {
+		__iommu_free(tbl, ret, npages);
+		return IOMMU_MAPPING_ERROR;
+	}
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return ret;
+}
+
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+			     unsigned int npages)
+{
+	unsigned long entry, free_entry;
+
+	entry = dma_addr >> tbl->it_page_shift;
+	free_entry = entry - tbl->it_offset;
+
+	if (((free_entry + npages) > tbl->it_size) ||
+	    (entry < tbl->it_offset)) {
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "iommu_free: invalid entry\n");
+			printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
+			printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
+			printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
+			printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
+			printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
+			printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
+			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
+			WARN_ON(1);
+		}
+
+		return false;
+	}
+
+	return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
+	}
+
+	return p;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long flags;
+	struct iommu_pool *pool;
+
+	entry = dma_addr >> tbl->it_page_shift;
+	free_entry = entry - tbl->it_offset;
+
+	pool = get_pool(tbl, free_entry);
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
+	tbl->it_ops->clear(tbl, entry, npages);
+
+	spin_lock_irqsave(&(pool->lock), flags);
+	bitmap_clear(tbl->it_map, free_entry, npages);
+	spin_unlock_irqrestore(&(pool->lock), flags);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+		unsigned int npages)
+{
+	__iommu_free(tbl, dma_addr, npages);
+
+	/* Make sure TLB cache is flushed if the HW needs it. We do
+	 * not do an mb() here on purpose, it is not needed on any of
+	 * the current platforms.
+	 */
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
+}
+
+int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+		     struct scatterlist *sglist, int nelems,
+		     unsigned long mask, enum dma_data_direction direction,
+		     unsigned long attrs)
+{
+	dma_addr_t dma_next = 0, dma_addr;
+	struct scatterlist *s, *outs, *segstart;
+	int outcount, incount, i, build_fail = 0;
+	unsigned int align;
+	unsigned long handle;
+	unsigned int max_seg_size;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if ((nelems == 0) || !tbl)
+		return 0;
+
+	outs = s = segstart = &sglist[0];
+	outcount = 1;
+	incount = nelems;
+	handle = 0;
+
+	/* Init first segment length for backout at failure */
+	outs->dma_length = 0;
+
+	DBG("sg mapping %d elements:\n", nelems);
+
+	max_seg_size = dma_get_max_seg_size(dev);
+	for_each_sg(sglist, s, nelems, i) {
+		unsigned long vaddr, npages, entry, slen;
+
+		slen = s->length;
+		/* Sanity check */
+		if (slen == 0) {
+			dma_next = 0;
+			continue;
+		}
+		/* Allocate iommu entries for that segment */
+		vaddr = (unsigned long) sg_virt(s);
+		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
+		align = 0;
+		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
+		    (vaddr & ~PAGE_MASK) == 0)
+			align = PAGE_SHIFT - tbl->it_page_shift;
+		entry = iommu_range_alloc(dev, tbl, npages, &handle,
+					  mask >> tbl->it_page_shift, align);
+
+		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
+
+		/* Handle failure */
+		if (unlikely(entry == IOMMU_MAPPING_ERROR)) {
+			if (!(attrs & DMA_ATTR_NO_WARN) &&
+			    printk_ratelimit())
+				dev_info(dev, "iommu_alloc failed, tbl %p "
+					 "vaddr %lx npages %lu\n", tbl, vaddr,
+					 npages);
+			goto failure;
+		}
+
+		/* Convert entry to a dma_addr_t */
+		entry += tbl->it_offset;
+		dma_addr = entry << tbl->it_page_shift;
+		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
+
+		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
+			    npages, entry, dma_addr);
+
+		/* Insert into HW table */
+		build_fail = tbl->it_ops->set(tbl, entry, npages,
+					      vaddr & IOMMU_PAGE_MASK(tbl),
+					      direction, attrs);
+		if(unlikely(build_fail))
+			goto failure;
+
+		/* If we are in an open segment, try merging */
+		if (segstart != s) {
+			DBG("  - trying merge...\n");
+			/* We cannot merge if:
+			 * - allocated dma_addr isn't contiguous to previous allocation
+			 */
+			if (novmerge || (dma_addr != dma_next) ||
+			    (outs->dma_length + s->length > max_seg_size)) {
+				/* Can't merge: create a new segment */
+				segstart = s;
+				outcount++;
+				outs = sg_next(outs);
+				DBG("    can't merge, new segment.\n");
+			} else {
+				outs->dma_length += s->length;
+				DBG("    merged, new len: %ux\n", outs->dma_length);
+			}
+		}
+
+		if (segstart == s) {
+			/* This is a new segment, fill entries */
+			DBG("  - filling new segment.\n");
+			outs->dma_address = dma_addr;
+			outs->dma_length = slen;
+		}
+
+		/* Calculate next page pointer for contiguous check */
+		dma_next = dma_addr + slen;
+
+		DBG("  - dma next is: %lx\n", dma_next);
+	}
+
+	/* Flush/invalidate TLB caches if necessary */
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
+
+	DBG("mapped %d elements:\n", outcount);
+
+	/* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
+	 * next entry of the sglist if we didn't fill the list completely
+	 */
+	if (outcount < incount) {
+		outs = sg_next(outs);
+		outs->dma_address = IOMMU_MAPPING_ERROR;
+		outs->dma_length = 0;
+	}
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	return outcount;
+
+ failure:
+	for_each_sg(sglist, s, nelems, i) {
+		if (s->dma_length != 0) {
+			unsigned long vaddr, npages;
+
+			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
+			npages = iommu_num_pages(s->dma_address, s->dma_length,
+						 IOMMU_PAGE_SIZE(tbl));
+			__iommu_free(tbl, vaddr, npages);
+			s->dma_address = IOMMU_MAPPING_ERROR;
+			s->dma_length = 0;
+		}
+		if (s == outs)
+			break;
+	}
+	return 0;
+}
+
+
+void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+			int nelems, enum dma_data_direction direction,
+			unsigned long attrs)
+{
+	struct scatterlist *sg;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (!tbl)
+		return;
+
+	sg = sglist;
+	while (nelems--) {
+		unsigned int npages;
+		dma_addr_t dma_handle = sg->dma_address;
+
+		if (sg->dma_length == 0)
+			break;
+		npages = iommu_num_pages(dma_handle, sg->dma_length,
+					 IOMMU_PAGE_SIZE(tbl));
+		__iommu_free(tbl, dma_handle, npages);
+		sg = sg_next(sg);
+	}
+
+	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
+	 * do not do an mb() here, the affected platforms do not need it
+	 * when freeing.
+	 */
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
+}
+
+static void iommu_table_clear(struct iommu_table *tbl)
+{
+	/*
+	 * In case of firmware assisted dump system goes through clean
+	 * reboot process at the time of system crash. Hence it's safe to
+	 * clear the TCE entries if firmware assisted dump is active.
+	 */
+	if (!is_kdump_kernel() || is_fadump_active()) {
+		/* Clear the table in case firmware left allocations in it */
+		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
+		return;
+	}
+
+#ifdef CONFIG_CRASH_DUMP
+	if (tbl->it_ops->get) {
+		unsigned long index, tceval, tcecount = 0;
+
+		/* Reserve the existing mappings left by the first kernel. */
+		for (index = 0; index < tbl->it_size; index++) {
+			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
+			/*
+			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
+			 */
+			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
+				__set_bit(index, tbl->it_map);
+				tcecount++;
+			}
+		}
+
+		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
+			printk(KERN_WARNING "TCE table is full; freeing ");
+			printk(KERN_WARNING "%d entries for the kdump boot\n",
+				KDUMP_MIN_TCE_ENTRIES);
+			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
+				index < tbl->it_size; index++)
+				__clear_bit(index, tbl->it_map);
+		}
+	}
+#endif
+}
+
+/*
+ * Build a iommu_table structure.  This contains a bit map which
+ * is used to manage allocation of the tce space.
+ */
+struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+{
+	unsigned long sz;
+	static int welcomed = 0;
+	struct page *page;
+	unsigned int i;
+	struct iommu_pool *p;
+
+	BUG_ON(!tbl->it_ops);
+
+	/* number of bytes needed for the bitmap */
+	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
+
+	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
+	if (!page)
+		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
+	tbl->it_map = page_address(page);
+	memset(tbl->it_map, 0, sz);
+
+	/*
+	 * Reserve page 0 so it will not be used for any mappings.
+	 * This avoids buggy drivers that consider page 0 to be invalid
+	 * to crash the machine or even lose data.
+	 */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+
+	for (i = 0; i < tbl->nr_pools; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
+
+	iommu_table_clear(tbl);
+
+	if (!welcomed) {
+		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
+		       novmerge ? "disabled" : "enabled");
+		welcomed = 1;
+	}
+
+	return tbl;
+}
+
+static void iommu_table_free(struct kref *kref)
+{
+	unsigned long bitmap_sz;
+	unsigned int order;
+	struct iommu_table *tbl;
+
+	tbl = container_of(kref, struct iommu_table, it_kref);
+
+	if (tbl->it_ops->free)
+		tbl->it_ops->free(tbl);
+
+	if (!tbl->it_map) {
+		kfree(tbl);
+		return;
+	}
+
+	/*
+	 * In case we have reserved the first bit, we should not emit
+	 * the warning below.
+	 */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	/* verify that table contains no entries */
+	if (!bitmap_empty(tbl->it_map, tbl->it_size))
+		pr_warn("%s: Unexpected TCEs\n", __func__);
+
+	/* calculate bitmap size in bytes */
+	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
+
+	/* free bitmap */
+	order = get_order(bitmap_sz);
+	free_pages((unsigned long) tbl->it_map, order);
+
+	/* free table */
+	kfree(tbl);
+}
+
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+	if (kref_get_unless_zero(&tbl->it_kref))
+		return tbl;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+	if (WARN_ON(!tbl))
+		return 0;
+
+	return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be
+ * contiguous real kernel storage (not vmalloc).  The address passed here
+ * comprises a page address and offset into that page. The dma_addr_t
+ * returned will point to the same byte within the page as was passed in.
+ */
+dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
+			  struct page *page, unsigned long offset, size_t size,
+			  unsigned long mask, enum dma_data_direction direction,
+			  unsigned long attrs)
+{
+	dma_addr_t dma_handle = IOMMU_MAPPING_ERROR;
+	void *vaddr;
+	unsigned long uaddr;
+	unsigned int npages, align;
+
+	BUG_ON(direction == DMA_NONE);
+
+	vaddr = page_address(page) + offset;
+	uaddr = (unsigned long)vaddr;
+
+	if (tbl) {
+		npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
+		align = 0;
+		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
+		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
+			align = PAGE_SHIFT - tbl->it_page_shift;
+
+		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
+					 mask >> tbl->it_page_shift, align,
+					 attrs);
+		if (dma_handle == IOMMU_MAPPING_ERROR) {
+			if (!(attrs & DMA_ATTR_NO_WARN) &&
+			    printk_ratelimit())  {
+				dev_info(dev, "iommu_alloc failed, tbl %p "
+					 "vaddr %p npages %d\n", tbl, vaddr,
+					 npages);
+			}
+		} else
+			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
+	}
+
+	return dma_handle;
+}
+
+void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
+		      size_t size, enum dma_data_direction direction,
+		      unsigned long attrs)
+{
+	unsigned int npages;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (tbl) {
+		npages = iommu_num_pages(dma_handle, size,
+					 IOMMU_PAGE_SIZE(tbl));
+		iommu_free(tbl, dma_handle, npages);
+	}
+}
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the virtual address of the buffer and sets dma_handle
+ * to the dma address (mapping) of the first page.
+ */
+void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
+			   size_t size,	dma_addr_t *dma_handle,
+			   unsigned long mask, gfp_t flag, int node)
+{
+	void *ret = NULL;
+	dma_addr_t mapping;
+	unsigned int order;
+	unsigned int nio_pages, io_order;
+	struct page *page;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+ 	/*
+	 * Client asked for way too much space.  This is checked later
+	 * anyway.  It is easier to debug here for the drivers than in
+	 * the tce tables.
+	 */
+	if (order >= IOMAP_MAX_ORDER) {
+		dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
+			 size);
+		return NULL;
+	}
+
+	if (!tbl)
+		return NULL;
+
+	/* Alloc enough pages (and possibly more) */
+	page = alloc_pages_node(node, flag, order);
+	if (!page)
+		return NULL;
+	ret = page_address(page);
+	memset(ret, 0, size);
+
+	/* Set up tces to cover the allocated range */
+	nio_pages = size >> tbl->it_page_shift;
+	io_order = get_iommu_order(size, tbl);
+	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
+			      mask >> tbl->it_page_shift, io_order, 0);
+	if (mapping == IOMMU_MAPPING_ERROR) {
+		free_pages((unsigned long)ret, order);
+		return NULL;
+	}
+	*dma_handle = mapping;
+	return ret;
+}
+
+void iommu_free_coherent(struct iommu_table *tbl, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	if (tbl) {
+		unsigned int nio_pages;
+
+		size = PAGE_ALIGN(size);
+		nio_pages = size >> tbl->it_page_shift;
+		iommu_free(tbl, dma_handle, nio_pages);
+		size = PAGE_ALIGN(size);
+		free_pages((unsigned long)vaddr, get_order(size));
+	}
+}
+
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return TCE_PCI_READ | TCE_PCI_WRITE;
+	case DMA_FROM_DEVICE:
+		return TCE_PCI_WRITE;
+	case DMA_TO_DEVICE:
+		return TCE_PCI_READ;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
+}
+
+void iommu_register_group(struct iommu_table_group *table_group,
+		int pci_domain_number, unsigned long pe_num)
+{
+	struct iommu_group *grp;
+	char *name;
+
+	grp = iommu_group_alloc();
+	if (IS_ERR(grp)) {
+		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+				PTR_ERR(grp));
+		return;
+	}
+	table_group->group = grp;
+	iommu_group_set_iommudata(grp, table_group, group_release);
+	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+			pci_domain_number, pe_num);
+	if (!name)
+		return;
+	iommu_group_set_name(grp, name);
+	kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+		return DMA_BIDIRECTIONAL;
+	else if (tce & TCE_PCI_READ)
+		return DMA_TO_DEVICE;
+	else if (tce & TCE_PCI_WRITE)
+		return DMA_FROM_DEVICE;
+	else
+		return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_check_ioba(unsigned long page_shift,
+		unsigned long offset, unsigned long size,
+		unsigned long ioba, unsigned long npages)
+{
+	unsigned long mask = (1UL << page_shift) - 1;
+
+	if (ioba & mask)
+		return -EINVAL;
+
+	ioba >>= page_shift;
+	if (ioba < offset)
+		return -EINVAL;
+
+	if ((ioba + 1) > (offset + size))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
+
+int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
+{
+	unsigned long mask = (1UL << page_shift) - 1;
+
+	if (gpa & mask)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
+
+long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret;
+
+	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
+
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL)))
+		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
+
+	/* if (unlikely(ret))
+		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+			__func__, hwaddr, entry << tbl->it_page_shift,
+				hwaddr, ret); */
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
+
+	/*
+	 * VFIO does not control TCE entries allocation and the guest
+	 * can write new TCEs on top of existing ones so iommu_tce_build()
+	 * must be able to release old pages. This functionality
+	 * requires exchange() callback defined so if it is not
+	 * implemented, we disallow taking ownership over the table.
+	 */
+	if (!tbl->it_ops->exchange)
+		return -EINVAL;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
+
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+		pr_err("iommu_tce: it_map is not empty");
+		ret = -EBUSY;
+		/* Restore bit#0 set by iommu_init_table() */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
+	}
+
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
+
+	memset(tbl->it_map, 0, sz);
+
+	/* Restore bit#0 set by iommu_init_table() */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+int iommu_add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	struct iommu_table_group_link *tgl;
+
+	/*
+	 * The sysfs entries should be populated before
+	 * binding IOMMU group. If sysfs entries isn't
+	 * ready, we simply bail.
+	 */
+	if (!device_is_registered(dev))
+		return -ENOENT;
+
+	if (dev->iommu_group) {
+		pr_debug("%s: Skipping device %s with iommu group %d\n",
+			 __func__, dev_name(dev),
+			 iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("%s: Skipping device %s with no tbl\n",
+			 __func__, dev_name(dev));
+		return 0;
+	}
+
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
+	if (!tgl) {
+		pr_debug("%s: Skipping device %s with no group\n",
+			 __func__, dev_name(dev));
+		return 0;
+	}
+	pr_debug("%s: Adding %s to iommu group %d\n",
+		 __func__, dev_name(dev),
+		 iommu_group_id(tgl->table_group->group));
+
+	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
+		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
+		       __func__, IOMMU_PAGE_SIZE(tbl),
+		       PAGE_SIZE, dev_name(dev));
+		return -EINVAL;
+	}
+
+	return iommu_group_add_device(tgl->table_group->group, dev);
+}
+EXPORT_SYMBOL_GPL(iommu_add_device);
+
+void iommu_del_device(struct device *dev)
+{
+	/*
+	 * Some devices might not have IOMMU table and group
+	 * and we needn't detach them from the associated
+	 * IOMMU groups
+	 */
+	if (!dev->iommu_group) {
+		pr_debug("iommu_tce: skipping device %s with no tbl\n",
+			 dev_name(dev));
+		return;
+	}
+
+	iommu_group_remove_device(dev);
+}
+EXPORT_SYMBOL_GPL(iommu_del_device);
+
+static int tce_iommu_bus_notifier(struct notifier_block *nb,
+                unsigned long action, void *data)
+{
+        struct device *dev = data;
+
+        switch (action) {
+        case BUS_NOTIFY_ADD_DEVICE:
+                return iommu_add_device(dev);
+        case BUS_NOTIFY_DEL_DEVICE:
+                if (dev->iommu_group)
+                        iommu_del_device(dev);
+                return 0;
+        default:
+                return 0;
+        }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+        .notifier_call = tce_iommu_bus_notifier,
+};
+
+int __init tce_iommu_bus_notifier_init(void)
+{
+        bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+        return 0;
+}
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
new file mode 100644
index 000000000..d37704ebc
--- /dev/null
+++ b/arch/powerpc/kernel/irq.c
@@ -0,0 +1,843 @@
+/*
+ *  Derived from arch/i386/kernel/irq.c
+ *    Copyright (C) 1992 Linus Torvalds
+ *  Adapted from arch/i386 by Gary Thomas
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Updated and modified by Cort Dougan <cort@fsmlabs.com>
+ *    Copyright (C) 1996-2001 Cort Dougan
+ *  Adapted for Power Macintosh by Paul Mackerras
+ *    Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file contains the code used by various IRQ handling routines:
+ * asking for different IRQ's should be done through these routines
+ * instead of just grabbing them. Thus setups with different IRQ numbers
+ * shouldn't result in any weird surprises, and installing new handlers
+ * should be easier.
+ *
+ * The MPC8xx has an interrupt mask in the SIU.  If a bit is set, the
+ * interrupt is _enabled_.  As expected, IRQ0 is bit 0 in the 32-bit
+ * mask register (of which only 16 are defined), hence the weird shifting
+ * and complement of the cached_irq_mask.  I want to be able to stuff
+ * this right into the SIU SMASK register.
+ * Many of the prep/chrp functions are conditional compiled on CONFIG_PPC_8xx
+ * to reduce code space and undefined function references.
+ */
+
+#undef DEBUG
+
+#include <linux/export.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/profile.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/machdep.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/livepatch.h>
+#include <asm/asm-prototypes.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/firmware.h>
+#include <asm/lv1call.h>
+#endif
+#define CREATE_TRACE_POINTS
+#include <asm/trace.h>
+#include <asm/cpu_has_feature.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+int __irq_offset_value;
+
+#ifdef CONFIG_PPC32
+EXPORT_SYMBOL(__irq_offset_value);
+atomic_t ppc_n_lost_interrupts;
+
+#ifdef CONFIG_TAU_INT
+extern int tau_initialized;
+u32 tau_interrupts(unsigned long cpu);
+#endif
+#endif /* CONFIG_PPC32 */
+
+#ifdef CONFIG_PPC64
+
+int distribute_irqs = 1;
+
+static inline notrace unsigned long get_irq_happened(void)
+{
+	unsigned long happened;
+
+	__asm__ __volatile__("lbz %0,%1(13)"
+	: "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened)));
+
+	return happened;
+}
+
+static inline notrace int decrementer_check_overflow(void)
+{
+ 	u64 now = get_tb_or_rtc();
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+ 
+	return now >= *next_tb;
+}
+
+/* This is called whenever we are re-enabling interrupts
+ * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if
+ * there's an EE, DEC or DBELL to generate.
+ *
+ * This is called in two contexts: From arch_local_irq_restore()
+ * before soft-enabling interrupts, and from the exception exit
+ * path when returning from an interrupt from a soft-disabled to
+ * a soft enabled context. In both case we have interrupts hard
+ * disabled.
+ *
+ * We take care of only clearing the bits we handled in the
+ * PACA irq_happened field since we can only re-emit one at a
+ * time and we don't want to "lose" one.
+ */
+notrace unsigned int __check_irq_replay(void)
+{
+	/*
+	 * We use local_paca rather than get_paca() to avoid all
+	 * the debug_smp_processor_id() business in this low level
+	 * function
+	 */
+	unsigned char happened = local_paca->irq_happened;
+
+	/*
+	 * We are responding to the next interrupt, so interrupt-off
+	 * latencies should be reset here.
+	 */
+	trace_hardirqs_on();
+	trace_hardirqs_off();
+
+	/*
+	 * We are always hard disabled here, but PACA_IRQ_HARD_DIS may
+	 * not be set, which means interrupts have only just been hard
+	 * disabled as part of the local_irq_restore or interrupt return
+	 * code. In that case, skip the decrementr check becaus it's
+	 * expensive to read the TB.
+	 *
+	 * HARD_DIS then gets cleared here, but it's reconciled later.
+	 * Either local_irq_disable will replay the interrupt and that
+	 * will reconcile state like other hard interrupts. Or interrupt
+	 * retur will replay the interrupt and in that case it sets
+	 * PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S).
+	 */
+	if (happened & PACA_IRQ_HARD_DIS) {
+		local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+
+		/*
+		 * We may have missed a decrementer interrupt if hard disabled.
+		 * Check the decrementer register in case we had a rollover
+		 * while hard disabled.
+		 */
+		if (!(happened & PACA_IRQ_DEC)) {
+			if (decrementer_check_overflow()) {
+				local_paca->irq_happened |= PACA_IRQ_DEC;
+				happened |= PACA_IRQ_DEC;
+			}
+		}
+	}
+
+	/*
+	 * Force the delivery of pending soft-disabled interrupts on PS3.
+	 * Any HV call will have this side effect.
+	 */
+	if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+		u64 tmp, tmp2;
+		lv1_get_version_info(&tmp, &tmp2);
+	}
+
+	/*
+	 * Check if an hypervisor Maintenance interrupt happened.
+	 * This is a higher priority interrupt than the others, so
+	 * replay it first.
+	 */
+	if (happened & PACA_IRQ_HMI) {
+		local_paca->irq_happened &= ~PACA_IRQ_HMI;
+		return 0xe60;
+	}
+
+	if (happened & PACA_IRQ_DEC) {
+		local_paca->irq_happened &= ~PACA_IRQ_DEC;
+		return 0x900;
+	}
+
+	if (happened & PACA_IRQ_PMI) {
+		local_paca->irq_happened &= ~PACA_IRQ_PMI;
+		return 0xf00;
+	}
+
+	if (happened & PACA_IRQ_EE) {
+		local_paca->irq_happened &= ~PACA_IRQ_EE;
+		return 0x500;
+	}
+
+#ifdef CONFIG_PPC_BOOK3E
+	/*
+	 * Check if an EPR external interrupt happened this bit is typically
+	 * set if we need to handle another "edge" interrupt from within the
+	 * MPIC "EPR" handler.
+	 */
+	if (happened & PACA_IRQ_EE_EDGE) {
+		local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
+		return 0x500;
+	}
+
+	if (happened & PACA_IRQ_DBELL) {
+		local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+		return 0x280;
+	}
+#else
+	if (happened & PACA_IRQ_DBELL) {
+		local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+		return 0xa00;
+	}
+#endif /* CONFIG_PPC_BOOK3E */
+
+	/* There should be nothing left ! */
+	BUG_ON(local_paca->irq_happened != 0);
+
+	return 0;
+}
+
+notrace void arch_local_irq_restore(unsigned long mask)
+{
+	unsigned char irq_happened;
+	unsigned int replay;
+
+	/* Write the new soft-enabled value */
+	irq_soft_mask_set(mask);
+	if (mask)
+		return;
+
+	/*
+	 * From this point onward, we can take interrupts, preempt,
+	 * etc... unless we got hard-disabled. We check if an event
+	 * happened. If none happened, we know we can just return.
+	 *
+	 * We may have preempted before the check below, in which case
+	 * we are checking the "new" CPU instead of the old one. This
+	 * is only a problem if an event happened on the "old" CPU.
+	 *
+	 * External interrupt events will have caused interrupts to
+	 * be hard-disabled, so there is no problem, we
+	 * cannot have preempted.
+	 */
+	irq_happened = get_irq_happened();
+	if (!irq_happened) {
+		/*
+		 * FIXME. Here we'd like to be able to do:
+		 *
+		 * #ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG
+		 *   WARN_ON(!(mfmsr() & MSR_EE));
+		 * #endif
+		 *
+		 * But currently it hits in a few paths, we should fix those and
+		 * enable the warning.
+		 */
+		return;
+	}
+
+	/*
+	 * We need to hard disable to get a trusted value from
+	 * __check_irq_replay(). We also need to soft-disable
+	 * again to avoid warnings in there due to the use of
+	 * per-cpu variables.
+	 */
+	if (!(irq_happened & PACA_IRQ_HARD_DIS)) {
+#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG
+		WARN_ON(!(mfmsr() & MSR_EE));
+#endif
+		__hard_irq_disable();
+#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG
+	} else {
+		/*
+		 * We should already be hard disabled here. We had bugs
+		 * where that wasn't the case so let's dbl check it and
+		 * warn if we are wrong. Only do that when IRQ tracing
+		 * is enabled as mfmsr() can be costly.
+		 */
+		if (WARN_ON(mfmsr() & MSR_EE))
+			__hard_irq_disable();
+#endif
+	}
+
+	irq_soft_mask_set(IRQS_ALL_DISABLED);
+	trace_hardirqs_off();
+
+	/*
+	 * Check if anything needs to be re-emitted. We haven't
+	 * soft-enabled yet to avoid warnings in decrementer_check_overflow
+	 * accessing per-cpu variables
+	 */
+	replay = __check_irq_replay();
+
+	/* We can soft-enable now */
+	trace_hardirqs_on();
+	irq_soft_mask_set(IRQS_ENABLED);
+
+	/*
+	 * And replay if we have to. This will return with interrupts
+	 * hard-enabled.
+	 */
+	if (replay) {
+		__replay_interrupt(replay);
+		return;
+	}
+
+	/* Finally, let's ensure we are hard enabled */
+	__hard_irq_enable();
+}
+EXPORT_SYMBOL(arch_local_irq_restore);
+
+/*
+ * This is specifically called by assembly code to re-enable interrupts
+ * if they are currently disabled. This is typically called before
+ * schedule() or do_signal() when returning to userspace. We do it
+ * in C to avoid the burden of dealing with lockdep etc...
+ *
+ * NOTE: This is called with interrupts hard disabled but not marked
+ * as such in paca->irq_happened, so we need to resync this.
+ */
+void notrace restore_interrupts(void)
+{
+	if (irqs_disabled()) {
+		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+		local_irq_enable();
+	} else
+		__hard_irq_enable();
+}
+
+/*
+ * This is a helper to use when about to go into idle low-power
+ * when the latter has the side effect of re-enabling interrupts
+ * (such as calling H_CEDE under pHyp).
+ *
+ * You call this function with interrupts soft-disabled (this is
+ * already the case when ppc_md.power_save is called). The function
+ * will return whether to enter power save or just return.
+ *
+ * In the former case, it will have notified lockdep of interrupts
+ * being re-enabled and generally sanitized the lazy irq state,
+ * and in the latter case it will leave with interrupts hard
+ * disabled and marked as such, so the local_irq_enable() call
+ * in arch_cpu_idle() will properly re-enable everything.
+ */
+bool prep_irq_for_idle(void)
+{
+	/*
+	 * First we need to hard disable to ensure no interrupt
+	 * occurs before we effectively enter the low power state
+	 */
+	__hard_irq_disable();
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+	/*
+	 * If anything happened while we were soft-disabled,
+	 * we return now and do not enter the low power state.
+	 */
+	if (lazy_irq_pending())
+		return false;
+
+	/* Tell lockdep we are about to re-enable */
+	trace_hardirqs_on();
+
+	/*
+	 * Mark interrupts as soft-enabled and clear the
+	 * PACA_IRQ_HARD_DIS from the pending mask since we
+	 * are about to hard enable as well as a side effect
+	 * of entering the low power state.
+	 */
+	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+	irq_soft_mask_set(IRQS_ENABLED);
+
+	/* Tell the caller to enter the low power state */
+	return true;
+}
+
+#ifdef CONFIG_PPC_BOOK3S
+/*
+ * This is for idle sequences that return with IRQs off, but the
+ * idle state itself wakes on interrupt. Tell the irq tracer that
+ * IRQs are enabled for the duration of idle so it does not get long
+ * off times. Must be paired with fini_irq_for_idle_irqsoff.
+ */
+bool prep_irq_for_idle_irqsoff(void)
+{
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * First we need to hard disable to ensure no interrupt
+	 * occurs before we effectively enter the low power state
+	 */
+	__hard_irq_disable();
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+	/*
+	 * If anything happened while we were soft-disabled,
+	 * we return now and do not enter the low power state.
+	 */
+	if (lazy_irq_pending())
+		return false;
+
+	/* Tell lockdep we are about to re-enable */
+	trace_hardirqs_on();
+
+	return true;
+}
+
+/*
+ * Take the SRR1 wakeup reason, index into this table to find the
+ * appropriate irq_happened bit.
+ *
+ * Sytem reset exceptions taken in idle state also come through here,
+ * but they are NMI interrupts so do not need to wait for IRQs to be
+ * restored, and should be taken as early as practical. These are marked
+ * with 0xff in the table. The Power ISA specifies 0100b as the system
+ * reset interrupt reason.
+ */
+#define IRQ_SYSTEM_RESET	0xff
+
+static const u8 srr1_to_lazyirq[0x10] = {
+	0, 0, 0,
+	PACA_IRQ_DBELL,
+	IRQ_SYSTEM_RESET,
+	PACA_IRQ_DBELL,
+	PACA_IRQ_DEC,
+	0,
+	PACA_IRQ_EE,
+	PACA_IRQ_EE,
+	PACA_IRQ_HMI,
+	0, 0, 0, 0, 0 };
+
+void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+	regs.trap = 0x100;
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+EXPORT_SYMBOL_GPL(replay_system_reset);
+
+void irq_set_pending_from_srr1(unsigned long srr1)
+{
+	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
+	u8 reason = srr1_to_lazyirq[idx];
+
+	/*
+	 * Take the system reset now, which is immediately after registers
+	 * are restored from idle. It's an NMI, so interrupts need not be
+	 * re-enabled before it is taken.
+	 */
+	if (unlikely(reason == IRQ_SYSTEM_RESET)) {
+		replay_system_reset();
+		return;
+	}
+
+	/*
+	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
+	 * so this can be called unconditionally with the SRR1 wake
+	 * reason as returned by the idle code, which uses 0 to mean no
+	 * interrupt.
+	 *
+	 * If a future CPU was to designate this as an interrupt reason,
+	 * then a new index for no interrupt must be assigned.
+	 */
+	local_paca->irq_happened |= reason;
+}
+#endif /* CONFIG_PPC_BOOK3S */
+
+/*
+ * Force a replay of the external interrupt handler on this CPU.
+ */
+void force_external_irq_replay(void)
+{
+	/*
+	 * This must only be called with interrupts soft-disabled,
+	 * the replay will happen when re-enabling.
+	 */
+	WARN_ON(!arch_irqs_disabled());
+
+	/*
+	 * Interrupts must always be hard disabled before irq_happened is
+	 * modified (to prevent lost update in case of interrupt between
+	 * load and store).
+	 */
+	__hard_irq_disable();
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+	/* Indicate in the PACA that we have an interrupt to replay */
+	local_paca->irq_happened |= PACA_IRQ_EE;
+}
+
+#endif /* CONFIG_PPC64 */
+
+int arch_show_interrupts(struct seq_file *p, int prec)
+{
+	int j;
+
+#if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT)
+	if (tau_initialized) {
+		seq_printf(p, "%*s: ", prec, "TAU");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", tau_interrupts(j));
+		seq_puts(p, "  PowerPC             Thermal Assist (cpu temp)\n");
+	}
+#endif /* CONFIG_PPC32 && CONFIG_TAU_INT */
+
+	seq_printf(p, "%*s: ", prec, "LOC");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event);
+        seq_printf(p, "  Local timer interrupts for timer event device\n");
+
+	seq_printf(p, "%*s: ", prec, "BCT");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).broadcast_irqs_event);
+	seq_printf(p, "  Broadcast timer interrupts for timer event device\n");
+
+	seq_printf(p, "%*s: ", prec, "LOC");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others);
+        seq_printf(p, "  Local timer interrupts for others\n");
+
+	seq_printf(p, "%*s: ", prec, "SPU");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs);
+	seq_printf(p, "  Spurious interrupts\n");
+
+	seq_printf(p, "%*s: ", prec, "PMI");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs);
+	seq_printf(p, "  Performance monitoring interrupts\n");
+
+	seq_printf(p, "%*s: ", prec, "MCE");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions);
+	seq_printf(p, "  Machine check exceptions\n");
+
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		seq_printf(p, "%*s: ", prec, "HMI");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ",
+					per_cpu(irq_stat, j).hmi_exceptions);
+		seq_printf(p, "  Hypervisor Maintenance Interrupts\n");
+	}
+
+	seq_printf(p, "%*s: ", prec, "NMI");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs);
+	seq_printf(p, "  System Reset interrupts\n");
+
+#ifdef CONFIG_PPC_WATCHDOG
+	seq_printf(p, "%*s: ", prec, "WDG");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs);
+	seq_printf(p, "  Watchdog soft-NMI interrupts\n");
+#endif
+
+#ifdef CONFIG_PPC_DOORBELL
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		seq_printf(p, "%*s: ", prec, "DBL");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs);
+		seq_printf(p, "  Doorbell interrupts\n");
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event;
+
+	sum += per_cpu(irq_stat, cpu).broadcast_irqs_event;
+	sum += per_cpu(irq_stat, cpu).pmu_irqs;
+	sum += per_cpu(irq_stat, cpu).mce_exceptions;
+	sum += per_cpu(irq_stat, cpu).spurious_irqs;
+	sum += per_cpu(irq_stat, cpu).timer_irqs_others;
+	sum += per_cpu(irq_stat, cpu).hmi_exceptions;
+	sum += per_cpu(irq_stat, cpu).sreset_irqs;
+#ifdef CONFIG_PPC_WATCHDOG
+	sum += per_cpu(irq_stat, cpu).soft_nmi_irqs;
+#endif
+#ifdef CONFIG_PPC_DOORBELL
+	sum += per_cpu(irq_stat, cpu).doorbell_irqs;
+#endif
+
+	return sum;
+}
+
+static inline void check_stack_overflow(void)
+{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	long sp;
+
+	sp = current_stack_pointer() & (THREAD_SIZE-1);
+
+	/* check for stack overflow: is there less than 2KB free? */
+	if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
+		pr_err("do_IRQ: stack overflow: %ld\n",
+			sp - sizeof(struct thread_info));
+		dump_stack();
+	}
+#endif
+}
+
+void __do_irq(struct pt_regs *regs)
+{
+	unsigned int irq;
+
+	irq_enter();
+
+	trace_irq_entry(regs);
+
+	/*
+	 * Query the platform PIC for the interrupt & ack it.
+	 *
+	 * This will typically lower the interrupt line to the CPU
+	 */
+	irq = ppc_md.get_irq();
+
+	/* We can hard enable interrupts now to allow perf interrupts */
+	may_hard_irq_enable();
+
+	/* And finally process it */
+	if (unlikely(!irq))
+		__this_cpu_inc(irq_stat.spurious_irqs);
+	else
+		generic_handle_irq(irq);
+
+	trace_irq_exit(regs);
+
+	irq_exit();
+}
+
+void do_IRQ(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct thread_info *curtp, *irqtp, *sirqtp;
+
+	/* Switch to the irq stack to handle this */
+	curtp = current_thread_info();
+	irqtp = hardirq_ctx[raw_smp_processor_id()];
+	sirqtp = softirq_ctx[raw_smp_processor_id()];
+
+	check_stack_overflow();
+
+	/* Already there ? */
+	if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+		__do_irq(regs);
+		set_irq_regs(old_regs);
+		return;
+	}
+
+	/* Prepare the thread_info in the irq stack */
+	irqtp->task = curtp->task;
+	irqtp->flags = 0;
+
+	/* Copy the preempt_count so that the [soft]irq checks work. */
+	irqtp->preempt_count = curtp->preempt_count;
+
+	/* Switch stack and call */
+	call_do_irq(regs, irqtp);
+
+	/* Restore stack limit */
+	irqtp->task = NULL;
+
+	/* Copy back updates to the thread_info */
+	if (irqtp->flags)
+		set_bits(irqtp->flags, &curtp->flags);
+
+	set_irq_regs(old_regs);
+}
+
+void __init init_IRQ(void)
+{
+	if (ppc_md.init_IRQ)
+		ppc_md.init_IRQ();
+
+	exc_lvl_ctx_init();
+
+	irq_ctx_init();
+}
+
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+struct thread_info   *critirq_ctx[NR_CPUS] __read_mostly;
+struct thread_info    *dbgirq_ctx[NR_CPUS] __read_mostly;
+struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
+
+void exc_lvl_ctx_init(void)
+{
+	struct thread_info *tp;
+	int i, cpu_nr;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_PPC64
+		cpu_nr = i;
+#else
+#ifdef CONFIG_SMP
+		cpu_nr = get_hard_smp_processor_id(i);
+#else
+		cpu_nr = 0;
+#endif
+#endif
+
+		memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
+		tp = critirq_ctx[cpu_nr];
+		tp->cpu = cpu_nr;
+		tp->preempt_count = 0;
+
+#ifdef CONFIG_BOOKE
+		memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE);
+		tp = dbgirq_ctx[cpu_nr];
+		tp->cpu = cpu_nr;
+		tp->preempt_count = 0;
+
+		memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE);
+		tp = mcheckirq_ctx[cpu_nr];
+		tp->cpu = cpu_nr;
+		tp->preempt_count = HARDIRQ_OFFSET;
+#endif
+	}
+}
+#endif
+
+struct thread_info *softirq_ctx[NR_CPUS] __read_mostly;
+struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly;
+
+void irq_ctx_init(void)
+{
+	struct thread_info *tp;
+	int i;
+
+	for_each_possible_cpu(i) {
+		memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
+		tp = softirq_ctx[i];
+		tp->cpu = i;
+		klp_init_thread_info(tp);
+
+		memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
+		tp = hardirq_ctx[i];
+		tp->cpu = i;
+		klp_init_thread_info(tp);
+	}
+}
+
+void do_softirq_own_stack(void)
+{
+	struct thread_info *curtp, *irqtp;
+
+	curtp = current_thread_info();
+	irqtp = softirq_ctx[smp_processor_id()];
+	irqtp->task = curtp->task;
+	irqtp->flags = 0;
+	call_do_softirq(irqtp);
+	irqtp->task = NULL;
+
+	/* Set any flag that may have been set on the
+	 * alternate stack
+	 */
+	if (irqtp->flags)
+		set_bits(irqtp->flags, &curtp->flags);
+}
+
+irq_hw_number_t virq_to_hw(unsigned int virq)
+{
+	struct irq_data *irq_data = irq_get_irq_data(virq);
+	return WARN_ON(!irq_data) ? 0 : irq_data->hwirq;
+}
+EXPORT_SYMBOL_GPL(virq_to_hw);
+
+#ifdef CONFIG_SMP
+int irq_choose_cpu(const struct cpumask *mask)
+{
+	int cpuid;
+
+	if (cpumask_equal(mask, cpu_online_mask)) {
+		static int irq_rover;
+		static DEFINE_RAW_SPINLOCK(irq_rover_lock);
+		unsigned long flags;
+
+		/* Round-robin distribution... */
+do_round_robin:
+		raw_spin_lock_irqsave(&irq_rover_lock, flags);
+
+		irq_rover = cpumask_next(irq_rover, cpu_online_mask);
+		if (irq_rover >= nr_cpu_ids)
+			irq_rover = cpumask_first(cpu_online_mask);
+
+		cpuid = irq_rover;
+
+		raw_spin_unlock_irqrestore(&irq_rover_lock, flags);
+	} else {
+		cpuid = cpumask_first_and(mask, cpu_online_mask);
+		if (cpuid >= nr_cpu_ids)
+			goto do_round_robin;
+	}
+
+	return get_hard_smp_processor_id(cpuid);
+}
+#else
+int irq_choose_cpu(const struct cpumask *mask)
+{
+	return hard_smp_processor_id();
+}
+#endif
+
+int arch_early_irq_init(void)
+{
+	return 0;
+}
+
+#ifdef CONFIG_PPC64
+static int __init setup_noirqdistrib(char *str)
+{
+	distribute_irqs = 0;
+	return 1;
+}
+
+__setup("noirqdistrib", setup_noirqdistrib);
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
new file mode 100644
index 000000000..1df6c74aa
--- /dev/null
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -0,0 +1,358 @@
+/*
+ * Routines for tracking a legacy ISA bridge
+ *
+ * Copyrigh 2007 Benjamin Herrenschmidt <benh@kernel.crashing.org>, IBM Corp.
+ *
+ * Some bits and pieces moved over from pci_64.c
+ *
+ * Copyrigh 2003 Anton Blanchard <anton@au.ibm.com>, IBM Corp.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/isa-bridge.h>
+
+unsigned long isa_io_base;	/* NULL if no ISA bus */
+EXPORT_SYMBOL(isa_io_base);
+
+/* Cached ISA bridge dev. */
+static struct device_node *isa_bridge_devnode;
+struct pci_dev *isa_bridge_pcidev;
+EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
+
+#define ISA_SPACE_MASK 0x1
+#define ISA_SPACE_IO 0x1
+
+static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
+				      unsigned long phb_io_base_phys)
+{
+	/* We should get some saner parsing here and remove these structs */
+	struct pci_address {
+		u32 a_hi;
+		u32 a_mid;
+		u32 a_lo;
+	};
+
+	struct isa_address {
+		u32 a_hi;
+		u32 a_lo;
+	};
+
+	struct isa_range {
+		struct isa_address isa_addr;
+		struct pci_address pci_addr;
+		unsigned int size;
+	};
+
+	const struct isa_range *range;
+	unsigned long pci_addr;
+	unsigned int isa_addr;
+	unsigned int size;
+	int rlen = 0;
+
+	range = of_get_property(isa_node, "ranges", &rlen);
+	if (range == NULL || (rlen < sizeof(struct isa_range)))
+		goto inval_range;
+
+	/* From "ISA Binding to 1275"
+	 * The ranges property is laid out as an array of elements,
+	 * each of which comprises:
+	 *   cells 0 - 1:	an ISA address
+	 *   cells 2 - 4:	a PCI address
+	 *			(size depending on dev->n_addr_cells)
+	 *   cell 5:		the size of the range
+	 */
+	if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO) {
+		range++;
+		rlen -= sizeof(struct isa_range);
+		if (rlen < sizeof(struct isa_range))
+			goto inval_range;
+	}
+	if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO)
+		goto inval_range;
+
+	isa_addr = range->isa_addr.a_lo;
+	pci_addr = (unsigned long) range->pci_addr.a_mid << 32 |
+		range->pci_addr.a_lo;
+
+	/* Assume these are both zero. Note: We could fix that and
+	 * do a proper parsing instead ... oh well, that will do for
+	 * now as nobody uses fancy mappings for ISA bridges
+	 */
+	if ((pci_addr != 0) || (isa_addr != 0)) {
+		printk(KERN_ERR "unexpected isa to pci mapping: %s\n",
+		       __func__);
+		return;
+	}
+
+	/* Align size and make sure it's cropped to 64K */
+	size = PAGE_ALIGN(range->size);
+	if (size > 0x10000)
+		size = 0x10000;
+
+	__ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
+		     size, pgprot_val(pgprot_noncached(__pgprot(0))));
+	return;
+
+inval_range:
+	printk(KERN_ERR "no ISA IO ranges or unexpected isa range, "
+	       "mapping 64k\n");
+	__ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE,
+		     0x10000, pgprot_val(pgprot_noncached(__pgprot(0))));
+}
+
+
+/**
+ * isa_bridge_find_early - Find and map the ISA IO space early before
+ *                         main PCI discovery. This is optionally called by
+ *                         the arch code when adding PCI PHBs to get early
+ *                         access to ISA IO ports
+ */
+void __init isa_bridge_find_early(struct pci_controller *hose)
+{
+	struct device_node *np, *parent = NULL, *tmp;
+
+	/* If we already have an ISA bridge, bail off */
+	if (isa_bridge_devnode != NULL)
+		return;
+
+	/* For each "isa" node in the system. Note : we do a search by
+	 * type and not by name. It might be better to do by name but that's
+	 * what the code used to do and I don't want to break too much at
+	 * once. We can look into changing that separately
+	 */
+	for_each_node_by_type(np, "isa") {
+		/* Look for our hose being a parent */
+		for (parent = of_get_parent(np); parent;) {
+			if (parent == hose->dn) {
+				of_node_put(parent);
+				break;
+			}
+			tmp = parent;
+			parent = of_get_parent(parent);
+			of_node_put(tmp);
+		}
+		if (parent != NULL)
+			break;
+	}
+	if (np == NULL)
+		return;
+	isa_bridge_devnode = np;
+
+	/* Now parse the "ranges" property and setup the ISA mapping */
+	pci_process_ISA_OF_ranges(np, hose->io_base_phys);
+
+	/* Set the global ISA io base to indicate we have an ISA bridge */
+	isa_io_base = ISA_IO_BASE;
+
+	pr_debug("ISA bridge (early) is %pOF\n", np);
+}
+
+/**
+ * isa_bridge_find_early - Find and map the ISA IO space early before
+ *                         main PCI discovery. This is optionally called by
+ *                         the arch code when adding PCI PHBs to get early
+ *                         access to ISA IO ports
+ */
+void __init isa_bridge_init_non_pci(struct device_node *np)
+{
+	const __be32 *ranges, *pbasep = NULL;
+	int rlen, i, rs;
+	u32 na, ns, pna;
+	u64 cbase, pbase, size = 0;
+
+	/* If we already have an ISA bridge, bail off */
+	if (isa_bridge_devnode != NULL)
+		return;
+
+	pna = of_n_addr_cells(np);
+	if (of_property_read_u32(np, "#address-cells", &na) ||
+	    of_property_read_u32(np, "#size-cells", &ns)) {
+		pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n",
+			np);
+		return;
+	}
+
+	/* Check it's a supported address format */
+	if (na != 2 || ns != 1) {
+		pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n",
+			np);
+		return;
+	}
+	rs = na + ns + pna;
+
+	/* Grab the ranges property */
+	ranges = of_get_property(np, "ranges", &rlen);
+	if (ranges == NULL || rlen < rs) {
+		pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n",
+			np);
+		return;
+	}
+
+	/* Parse it. We are only looking for IO space */
+	for (i = 0; (i + rs - 1) < rlen; i += rs) {
+		if (be32_to_cpup(ranges + i) != 1)
+			continue;
+		cbase = be32_to_cpup(ranges + i + 1);
+		size = of_read_number(ranges + i + na + pna, ns);
+		pbasep = ranges + i + na;
+		break;
+	}
+
+	/* Got something ? */
+	if (!size || !pbasep) {
+		pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n",
+			np);
+		return;
+	}
+
+	/* Align size and make sure it's cropped to 64K */
+	size = PAGE_ALIGN(size);
+	if (size > 0x10000)
+		size = 0x10000;
+
+	/* Map pbase */
+	pbase = of_translate_address(np, pbasep);
+	if (pbase == OF_BAD_ADDR) {
+		pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n",
+			np);
+		return;
+	}
+
+	/* We need page alignment */
+	if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) {
+		pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n",
+			np);
+		return;
+	}
+
+	/* Got it */
+	isa_bridge_devnode = np;
+
+	/* Set the global ISA io base to indicate we have an ISA bridge
+	 * and map it
+	 */
+	isa_io_base = ISA_IO_BASE;
+	__ioremap_at(pbase, (void *)ISA_IO_BASE,
+		     size, pgprot_val(pgprot_noncached(__pgprot(0))));
+
+	pr_debug("ISA: Non-PCI bridge is %pOF\n", np);
+}
+
+/**
+ * isa_bridge_find_late - Find and map the ISA IO space upon discovery of
+ *                        a new ISA bridge
+ */
+static void isa_bridge_find_late(struct pci_dev *pdev,
+				 struct device_node *devnode)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+
+	/* Store ISA device node and PCI device */
+	isa_bridge_devnode = of_node_get(devnode);
+	isa_bridge_pcidev = pdev;
+
+	/* Now parse the "ranges" property and setup the ISA mapping */
+	pci_process_ISA_OF_ranges(devnode, hose->io_base_phys);
+
+	/* Set the global ISA io base to indicate we have an ISA bridge */
+	isa_io_base = ISA_IO_BASE;
+
+	pr_debug("ISA bridge (late) is %pOF on %s\n",
+		 devnode, pci_name(pdev));
+}
+
+/**
+ * isa_bridge_remove - Remove/unmap an ISA bridge
+ */
+static void isa_bridge_remove(void)
+{
+	pr_debug("ISA bridge removed !\n");
+
+	/* Clear the global ISA io base to indicate that we have no more
+	 * ISA bridge. Note that drivers don't quite handle that, though
+	 * we should probably do something about it. But do we ever really
+	 * have ISA bridges being removed on machines using legacy devices ?
+	 */
+	isa_io_base = ISA_IO_BASE;
+
+	/* Clear references to the bridge */
+	of_node_put(isa_bridge_devnode);
+	isa_bridge_devnode = NULL;
+	isa_bridge_pcidev = NULL;
+
+	/* Unmap the ISA area */
+	__iounmap_at((void *)ISA_IO_BASE, 0x10000);
+}
+
+/**
+ * isa_bridge_notify - Get notified of PCI devices addition/removal
+ */
+static int isa_bridge_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct device_node *devnode = pci_device_to_OF_node(pdev);
+
+	switch(action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		/* Check if we have an early ISA device, without PCI dev */
+		if (isa_bridge_devnode && isa_bridge_devnode == devnode &&
+		    !isa_bridge_pcidev) {
+			pr_debug("ISA bridge PCI attached: %s\n",
+				 pci_name(pdev));
+			isa_bridge_pcidev = pdev;
+		}
+
+		/* Check if we have no ISA device, and this happens to be one,
+		 * register it as such if it has an OF device
+		 */
+		if (!isa_bridge_devnode && devnode && devnode->type &&
+		    !strcmp(devnode->type, "isa"))
+			isa_bridge_find_late(pdev, devnode);
+
+		return 0;
+	case BUS_NOTIFY_DEL_DEVICE:
+		/* Check if this our existing ISA device */
+		if (pdev == isa_bridge_pcidev ||
+		    (devnode && devnode == isa_bridge_devnode))
+			isa_bridge_remove();
+		return 0;
+	}
+	return 0;
+}
+
+static struct notifier_block isa_bridge_notifier = {
+	.notifier_call = isa_bridge_notify
+};
+
+/**
+ * isa_bridge_init - register to be notified of ISA bridge addition/removal
+ *
+ */
+static int __init isa_bridge_init(void)
+{
+	bus_register_notifier(&pci_bus_type, &isa_bridge_notifier);
+	return 0;
+}
+arch_initcall(isa_bridge_init);
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
new file mode 100644
index 000000000..0080c5fbd
--- /dev/null
+++ b/arch/powerpc/kernel/jump_label.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2010 Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <asm/code-patching.h>
+
+void arch_jump_label_transform(struct jump_entry *entry,
+			       enum jump_label_type type)
+{
+	u32 *addr = (u32 *)(unsigned long)entry->code;
+
+	if (type == JUMP_LABEL_JMP)
+		patch_branch(addr, entry->target, 0);
+	else
+		patch_instruction(addr, PPC_INST_NOP);
+}
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
new file mode 100644
index 000000000..ba4f18a43
--- /dev/null
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -0,0 +1,664 @@
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation (version 2 of the License).
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt)	"kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
+#include <linux/module.h>
+#include <linux/of_fdt.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define PURGATORY_STACK_SIZE	(16 * 1024)
+
+#define elf_addr_to_cpu	elf64_to_cpu
+
+#ifndef Elf_Rel
+#define Elf_Rel		Elf64_Rel
+#endif /* Elf_Rel */
+
+struct elf_info {
+	/*
+	 * Where the ELF binary contents are kept.
+	 * Memory managed by the user of the struct.
+	 */
+	const char *buffer;
+
+	const struct elfhdr *ehdr;
+	const struct elf_phdr *proghdrs;
+	struct elf_shdr *sechdrs;
+};
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+       return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+		value = le64_to_cpu(value);
+	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+		value = be64_to_cpu(value);
+
+	return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+		value = le16_to_cpu(value);
+	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+		value = be16_to_cpu(value);
+
+	return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+	if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+		value = le32_to_cpu(value);
+	else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+		value = be32_to_cpu(value);
+
+	return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len:	size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+	if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+		pr_debug("Bad program header size.\n");
+		return false;
+	} else if (ehdr->e_shnum > 0 &&
+		   ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+		pr_debug("Bad section header size.\n");
+		return false;
+	} else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+		   ehdr->e_version != EV_CURRENT) {
+		pr_debug("Unknown ELF version.\n");
+		return false;
+	}
+
+	if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+		size_t phdr_size;
+
+		/*
+		 * e_phnum is at most 65535 so calculating the size of the
+		 * program header cannot overflow.
+		 */
+		phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+		/* Sanity check the program header table location. */
+		if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+			pr_debug("Program headers at invalid location.\n");
+			return false;
+		} else if (ehdr->e_phoff + phdr_size > buf_len) {
+			pr_debug("Program headers truncated.\n");
+			return false;
+		}
+	}
+
+	if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+		size_t shdr_size;
+
+		/*
+		 * e_shnum is at most 65536 so calculating
+		 * the size of the section header cannot overflow.
+		 */
+		shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+		/* Sanity check the section header table location. */
+		if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+			pr_debug("Section headers at invalid location.\n");
+			return false;
+		} else if (ehdr->e_shoff + shdr_size > buf_len) {
+			pr_debug("Section headers truncated.\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+	struct elfhdr *buf_ehdr;
+
+	if (len < sizeof(*buf_ehdr)) {
+		pr_debug("Buffer is too small to hold ELF header.\n");
+		return -ENOEXEC;
+	}
+
+	memset(ehdr, 0, sizeof(*ehdr));
+	memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+	if (!elf_is_elf_file(ehdr)) {
+		pr_debug("No ELF header magic.\n");
+		return -ENOEXEC;
+	}
+
+	if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+		pr_debug("Not a supported ELF class.\n");
+		return -ENOEXEC;
+	} else  if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+		ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+		pr_debug("Not a supported ELF data format.\n");
+		return -ENOEXEC;
+	}
+
+	buf_ehdr = (struct elfhdr *) buf;
+	if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+		pr_debug("Bad ELF header size.\n");
+		return -ENOEXEC;
+	}
+
+	ehdr->e_type      = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+	ehdr->e_machine   = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+	ehdr->e_version   = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+	ehdr->e_entry     = elf_addr_to_cpu(ehdr, buf_ehdr->e_entry);
+	ehdr->e_phoff     = elf_addr_to_cpu(ehdr, buf_ehdr->e_phoff);
+	ehdr->e_shoff     = elf_addr_to_cpu(ehdr, buf_ehdr->e_shoff);
+	ehdr->e_flags     = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+	ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+	ehdr->e_phnum     = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+	ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+	ehdr->e_shnum     = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+	ehdr->e_shstrndx  = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+	return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len:	size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+	if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+		pr_debug("ELF segment location wraps around.\n");
+		return false;
+	} else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+		pr_debug("ELF segment not in file.\n");
+		return false;
+	} else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+		pr_debug("ELF segment address wraps around.\n");
+		return false;
+	}
+
+	return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len, struct elf_info *elf_info,
+			 int idx)
+{
+	/* Override the const in proghdrs, we are the ones doing the loading. */
+	struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+	const char *pbuf;
+	struct elf_phdr *buf_phdr;
+
+	pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+	buf_phdr = (struct elf_phdr *) pbuf;
+
+	phdr->p_type   = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+	phdr->p_offset = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_offset);
+	phdr->p_paddr  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_paddr);
+	phdr->p_vaddr  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_vaddr);
+	phdr->p_flags  = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+	/*
+	 * The following fields have a type equivalent to Elf_Addr
+	 * both in 32 bit and 64 bit ELF.
+	 */
+	phdr->p_filesz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_filesz);
+	phdr->p_memsz  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_memsz);
+	phdr->p_align  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_align);
+
+	return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+			  struct elf_info *elf_info)
+{
+	size_t phdr_size, i;
+	const struct elfhdr *ehdr = elf_info->ehdr;
+
+	/*
+	 * e_phnum is at most 65535 so calculating the size of the
+	 * program header cannot overflow.
+	 */
+	phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+	elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+	if (!elf_info->proghdrs)
+		return -ENOMEM;
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		int ret;
+
+		ret = elf_read_phdr(buf, len, elf_info, i);
+		if (ret) {
+			kfree(elf_info->proghdrs);
+			elf_info->proghdrs = NULL;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * elf_is_shdr_sane - check that it is safe to use the section header
+ * @buf_len:	size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_shdr_sane(const struct elf_shdr *shdr, size_t buf_len)
+{
+	bool size_ok;
+
+	/* SHT_NULL headers have undefined values, so we can't check them. */
+	if (shdr->sh_type == SHT_NULL)
+		return true;
+
+	/* Now verify sh_entsize */
+	switch (shdr->sh_type) {
+	case SHT_SYMTAB:
+		size_ok = shdr->sh_entsize == sizeof(Elf_Sym);
+		break;
+	case SHT_RELA:
+		size_ok = shdr->sh_entsize == sizeof(Elf_Rela);
+		break;
+	case SHT_DYNAMIC:
+		size_ok = shdr->sh_entsize == sizeof(Elf_Dyn);
+		break;
+	case SHT_REL:
+		size_ok = shdr->sh_entsize == sizeof(Elf_Rel);
+		break;
+	case SHT_NOTE:
+	case SHT_PROGBITS:
+	case SHT_HASH:
+	case SHT_NOBITS:
+	default:
+		/*
+		 * This is a section whose entsize requirements
+		 * I don't care about.  If I don't know about
+		 * the section I can't care about it's entsize
+		 * requirements.
+		 */
+		size_ok = true;
+		break;
+	}
+
+	if (!size_ok) {
+		pr_debug("ELF section with wrong entry size.\n");
+		return false;
+	} else if (shdr->sh_addr + shdr->sh_size < shdr->sh_addr) {
+		pr_debug("ELF section address wraps around.\n");
+		return false;
+	}
+
+	if (shdr->sh_type != SHT_NOBITS) {
+		if (shdr->sh_offset + shdr->sh_size < shdr->sh_offset) {
+			pr_debug("ELF section location wraps around.\n");
+			return false;
+		} else if (shdr->sh_offset + shdr->sh_size > buf_len) {
+			pr_debug("ELF section not in file.\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int elf_read_shdr(const char *buf, size_t len, struct elf_info *elf_info,
+			 int idx)
+{
+	struct elf_shdr *shdr = &elf_info->sechdrs[idx];
+	const struct elfhdr *ehdr = elf_info->ehdr;
+	const char *sbuf;
+	struct elf_shdr *buf_shdr;
+
+	sbuf = buf + ehdr->e_shoff + idx * sizeof(*buf_shdr);
+	buf_shdr = (struct elf_shdr *) sbuf;
+
+	shdr->sh_name      = elf32_to_cpu(ehdr, buf_shdr->sh_name);
+	shdr->sh_type      = elf32_to_cpu(ehdr, buf_shdr->sh_type);
+	shdr->sh_addr      = elf_addr_to_cpu(ehdr, buf_shdr->sh_addr);
+	shdr->sh_offset    = elf_addr_to_cpu(ehdr, buf_shdr->sh_offset);
+	shdr->sh_link      = elf32_to_cpu(ehdr, buf_shdr->sh_link);
+	shdr->sh_info      = elf32_to_cpu(ehdr, buf_shdr->sh_info);
+
+	/*
+	 * The following fields have a type equivalent to Elf_Addr
+	 * both in 32 bit and 64 bit ELF.
+	 */
+	shdr->sh_flags     = elf_addr_to_cpu(ehdr, buf_shdr->sh_flags);
+	shdr->sh_size      = elf_addr_to_cpu(ehdr, buf_shdr->sh_size);
+	shdr->sh_addralign = elf_addr_to_cpu(ehdr, buf_shdr->sh_addralign);
+	shdr->sh_entsize   = elf_addr_to_cpu(ehdr, buf_shdr->sh_entsize);
+
+	return elf_is_shdr_sane(shdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_shdrs - read the section headers from the buffer
+ *
+ * This function assumes that the section header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_shdrs(const char *buf, size_t len,
+			  struct elf_info *elf_info)
+{
+	size_t shdr_size, i;
+
+	/*
+	 * e_shnum is at most 65536 so calculating
+	 * the size of the section header cannot overflow.
+	 */
+	shdr_size = sizeof(struct elf_shdr) * elf_info->ehdr->e_shnum;
+
+	elf_info->sechdrs = kzalloc(shdr_size, GFP_KERNEL);
+	if (!elf_info->sechdrs)
+		return -ENOMEM;
+
+	for (i = 0; i < elf_info->ehdr->e_shnum; i++) {
+		int ret;
+
+		ret = elf_read_shdr(buf, len, elf_info, i);
+		if (ret) {
+			kfree(elf_info->sechdrs);
+			elf_info->sechdrs = NULL;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
+ * @buf:	Buffer to read ELF file from.
+ * @len:	Size of @buf.
+ * @ehdr:	Pointer to existing struct which will be populated.
+ * @elf_info:	Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call elf_free_info(elf_info) to
+ * free the memory allocated for the section and program headers.
+ */
+int elf_read_from_buffer(const char *buf, size_t len, struct elfhdr *ehdr,
+			 struct elf_info *elf_info)
+{
+	int ret;
+
+	ret = elf_read_ehdr(buf, len, ehdr);
+	if (ret)
+		return ret;
+
+	elf_info->buffer = buf;
+	elf_info->ehdr = ehdr;
+	if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+		ret = elf_read_phdrs(buf, len, elf_info);
+		if (ret)
+			return ret;
+	}
+	if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+		ret = elf_read_shdrs(buf, len, elf_info);
+		if (ret) {
+			kfree(elf_info->proghdrs);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * elf_free_info - free memory allocated by elf_read_from_buffer
+ */
+void elf_free_info(struct elf_info *elf_info)
+{
+	kfree(elf_info->proghdrs);
+	kfree(elf_info->sechdrs);
+	memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * build_elf_exec_info - read ELF executable and check that we can use it
+ */
+static int build_elf_exec_info(const char *buf, size_t len, struct elfhdr *ehdr,
+			       struct elf_info *elf_info)
+{
+	int i;
+	int ret;
+
+	ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+	if (ret)
+		return ret;
+
+	/* Big endian vmlinux has type ET_DYN. */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+		pr_err("Not an ELF executable.\n");
+		goto error;
+	} else if (!elf_info->proghdrs) {
+		pr_err("No ELF program header.\n");
+		goto error;
+	}
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		/*
+		 * Kexec does not support loading interpreters.
+		 * In addition this check keeps us from attempting
+		 * to kexec ordinay executables.
+		 */
+		if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+			pr_err("Requires an ELF interpreter.\n");
+			goto error;
+		}
+	}
+
+	return 0;
+error:
+	elf_free_info(elf_info);
+	return -ENOEXEC;
+}
+
+static int elf64_probe(const char *buf, unsigned long len)
+{
+	struct elfhdr ehdr;
+	struct elf_info elf_info;
+	int ret;
+
+	ret = build_elf_exec_info(buf, len, &ehdr, &elf_info);
+	if (ret)
+		return ret;
+
+	elf_free_info(&elf_info);
+
+	return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_exec_load - load ELF executable image
+ * @lowest_load_addr:	On return, will be the address where the first PT_LOAD
+ *			section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+static int elf_exec_load(struct kimage *image, struct elfhdr *ehdr,
+			 struct elf_info *elf_info,
+			 unsigned long *lowest_load_addr)
+{
+	unsigned long base = 0, lowest_addr = UINT_MAX;
+	int ret;
+	size_t i;
+	struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size,
+				  .top_down = false };
+
+	/* Read in the PT_LOAD segments. */
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		unsigned long load_addr;
+		size_t size;
+		const struct elf_phdr *phdr;
+
+		phdr = &elf_info->proghdrs[i];
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		size = phdr->p_filesz;
+		if (size > phdr->p_memsz)
+			size = phdr->p_memsz;
+
+		kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset;
+		kbuf.bufsz = size;
+		kbuf.memsz = phdr->p_memsz;
+		kbuf.buf_align = phdr->p_align;
+		kbuf.buf_min = phdr->p_paddr + base;
+		ret = kexec_add_buffer(&kbuf);
+		if (ret)
+			goto out;
+		load_addr = kbuf.mem;
+
+		if (load_addr < lowest_addr)
+			lowest_addr = load_addr;
+	}
+
+	/* Update entry point to reflect new load address. */
+	ehdr->e_entry += base;
+
+	*lowest_load_addr = lowest_addr;
+	ret = 0;
+ out:
+	return ret;
+}
+
+static void *elf64_load(struct kimage *image, char *kernel_buf,
+			unsigned long kernel_len, char *initrd,
+			unsigned long initrd_len, char *cmdline,
+			unsigned long cmdline_len)
+{
+	int ret;
+	unsigned int fdt_size;
+	unsigned long kernel_load_addr;
+	unsigned long initrd_load_addr = 0, fdt_load_addr;
+	void *fdt;
+	const void *slave_code;
+	struct elfhdr ehdr;
+	struct elf_info elf_info;
+	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+				  .buf_max = ppc64_rma_size };
+	struct kexec_buf pbuf = { .image = image, .buf_min = 0,
+				  .buf_max = ppc64_rma_size, .top_down = true };
+
+	ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info);
+	if (ret)
+		goto out;
+
+	ret = elf_exec_load(image, &ehdr, &elf_info, &kernel_load_addr);
+	if (ret)
+		goto out;
+
+	pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr);
+
+	ret = kexec_load_purgatory(image, &pbuf);
+	if (ret) {
+		pr_err("Loading purgatory failed.\n");
+		goto out;
+	}
+
+	pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+
+	if (initrd != NULL) {
+		kbuf.buffer = initrd;
+		kbuf.bufsz = kbuf.memsz = initrd_len;
+		kbuf.buf_align = PAGE_SIZE;
+		kbuf.top_down = false;
+		ret = kexec_add_buffer(&kbuf);
+		if (ret)
+			goto out;
+		initrd_load_addr = kbuf.mem;
+
+		pr_debug("Loaded initrd at 0x%lx\n", initrd_load_addr);
+	}
+
+	fdt_size = fdt_totalsize(initial_boot_params) * 2;
+	fdt = kmalloc(fdt_size, GFP_KERNEL);
+	if (!fdt) {
+		pr_err("Not enough memory for the device tree.\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = fdt_open_into(initial_boot_params, fdt, fdt_size);
+	if (ret < 0) {
+		pr_err("Error setting up the new device tree.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+	if (ret)
+		goto out;
+
+	fdt_pack(fdt);
+
+	kbuf.buffer = fdt;
+	kbuf.bufsz = kbuf.memsz = fdt_size;
+	kbuf.buf_align = PAGE_SIZE;
+	kbuf.top_down = true;
+	ret = kexec_add_buffer(&kbuf);
+	if (ret)
+		goto out;
+	fdt_load_addr = kbuf.mem;
+
+	pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
+
+	slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
+	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
+			      fdt_load_addr);
+	if (ret)
+		pr_err("Error setting up the purgatory.\n");
+
+out:
+	elf_free_info(&elf_info);
+
+	/* Make kimage_file_post_load_cleanup free the fdt buffer for us. */
+	return ret ? ERR_PTR(ret) : fdt;
+}
+
+const struct kexec_file_ops kexec_elf64_ops = {
+	.probe = elf64_probe,
+	.load = elf64_load,
+};
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
new file mode 100644
index 000000000..59c578f86
--- /dev/null
+++ b/arch/powerpc/kernel/kgdb.c
@@ -0,0 +1,525 @@
+/*
+ * PowerPC backend to the KGDB stub.
+ *
+ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
+ * Copyright (C) 2003 Timesys Corporation.
+ * Copyright (C) 2004-2006 MontaVista Software, Inc.
+ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
+ * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and
+ * Sergei Shtylyov <sshtylyov@ru.mvista.com>
+ * Copyright (C) 2007-2008 Wind River Systems, Inc.
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program as licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/ptrace.h>
+#include <linux/kdebug.h>
+#include <asm/current.h>
+#include <asm/processor.h>
+#include <asm/machdep.h>
+#include <asm/debug.h>
+#include <asm/code-patching.h>
+#include <linux/slab.h>
+
+/*
+ * This table contains the mapping between PowerPC hardware trap types, and
+ * signals, which are primarily what GDB understands.  GDB and the kernel
+ * don't always agree on values, so we use constants taken from gdb-6.2.
+ */
+static struct hard_trap_info
+{
+	unsigned int tt;		/* Trap type code for powerpc */
+	unsigned char signo;		/* Signal that we map this trap into */
+} hard_trap_info[] = {
+	{ 0x0100, 0x02 /* SIGINT */  },		/* system reset */
+	{ 0x0200, 0x0b /* SIGSEGV */ },		/* machine check */
+	{ 0x0300, 0x0b /* SIGSEGV */ },		/* data access */
+	{ 0x0400, 0x0b /* SIGSEGV */ },		/* instruction access */
+	{ 0x0500, 0x02 /* SIGINT */  },		/* external interrupt */
+	{ 0x0600, 0x0a /* SIGBUS */  },		/* alignment */
+	{ 0x0700, 0x05 /* SIGTRAP */ },		/* program check */
+	{ 0x0800, 0x08 /* SIGFPE */  },		/* fp unavailable */
+	{ 0x0900, 0x0e /* SIGALRM */ },		/* decrementer */
+	{ 0x0c00, 0x14 /* SIGCHLD */ },		/* system call */
+#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+	{ 0x2002, 0x05 /* SIGTRAP */ },		/* debug */
+#if defined(CONFIG_FSL_BOOKE)
+	{ 0x2010, 0x08 /* SIGFPE */  },		/* spe unavailable */
+	{ 0x2020, 0x08 /* SIGFPE */  },		/* spe unavailable */
+	{ 0x2030, 0x08 /* SIGFPE */  },		/* spe fp data */
+	{ 0x2040, 0x08 /* SIGFPE */  },		/* spe fp data */
+	{ 0x2050, 0x08 /* SIGFPE */  },		/* spe fp round */
+	{ 0x2060, 0x0e /* SIGILL */  },		/* performance monitor */
+	{ 0x2900, 0x08 /* SIGFPE */  },		/* apu unavailable */
+	{ 0x3100, 0x0e /* SIGALRM */ },		/* fixed interval timer */
+	{ 0x3200, 0x02 /* SIGINT */  }, 	/* watchdog */
+#else /* ! CONFIG_FSL_BOOKE */
+	{ 0x1000, 0x0e /* SIGALRM */ },		/* prog interval timer */
+	{ 0x1010, 0x0e /* SIGALRM */ },		/* fixed interval timer */
+	{ 0x1020, 0x02 /* SIGINT */  }, 	/* watchdog */
+	{ 0x2010, 0x08 /* SIGFPE */  },		/* fp unavailable */
+	{ 0x2020, 0x08 /* SIGFPE */  },		/* ap unavailable */
+#endif
+#else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */
+	{ 0x0d00, 0x05 /* SIGTRAP */ },		/* single-step */
+#if defined(CONFIG_PPC_8xx)
+	{ 0x1000, 0x04 /* SIGILL */  },		/* software emulation */
+#else /* ! CONFIG_PPC_8xx */
+	{ 0x0f00, 0x04 /* SIGILL */  },		/* performance monitor */
+	{ 0x0f20, 0x08 /* SIGFPE */  },		/* altivec unavailable */
+	{ 0x1300, 0x05 /* SIGTRAP */ }, 	/* instruction address break */
+#if defined(CONFIG_PPC64)
+	{ 0x1200, 0x05 /* SIGILL */  },		/* system error */
+	{ 0x1500, 0x04 /* SIGILL */  },		/* soft patch */
+	{ 0x1600, 0x04 /* SIGILL */  },		/* maintenance */
+	{ 0x1700, 0x08 /* SIGFPE */  },		/* altivec assist */
+	{ 0x1800, 0x04 /* SIGILL */  },		/* thermal */
+#else /* ! CONFIG_PPC64 */
+	{ 0x1400, 0x02 /* SIGINT */  },		/* SMI */
+	{ 0x1600, 0x08 /* SIGFPE */  },		/* altivec assist */
+	{ 0x1700, 0x04 /* SIGILL */  },		/* TAU */
+	{ 0x2000, 0x05 /* SIGTRAP */ },		/* run mode */
+#endif
+#endif
+#endif
+	{ 0x0000, 0x00 }			/* Must be last */
+};
+
+static int computeSignal(unsigned int tt)
+{
+	struct hard_trap_info *ht;
+
+	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+		if (ht->tt == tt)
+			return ht->signo;
+
+	return SIGHUP;		/* default for things we don't know about */
+}
+
+/**
+ *
+ *	kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ *	@exception: Exception vector number
+ *	@regs: Current &struct pt_regs.
+ *
+ *	On some architectures we need to skip a breakpoint exception when
+ *	it occurs after a breakpoint has been removed.
+ *
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	return kgdb_isremovedbreak(regs->nip);
+}
+
+static int kgdb_call_nmi_hook(struct pt_regs *regs)
+{
+	kgdb_nmicallback(raw_smp_processor_id(), regs);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+void kgdb_roundup_cpus(unsigned long flags)
+{
+	smp_send_debugger_break();
+}
+#endif
+
+/* KGDB functions to use existing PowerPC64 hooks. */
+static int kgdb_debugger(struct pt_regs *regs)
+{
+	return !kgdb_handle_exception(1, computeSignal(TRAP(regs)),
+				      DIE_OOPS, regs);
+}
+
+static int kgdb_handle_breakpoint(struct pt_regs *regs)
+{
+	if (user_mode(regs))
+		return 0;
+
+	if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0)
+		return 0;
+
+	if (*(u32 *)regs->nip == BREAK_INSTR)
+		regs->nip += BREAK_INSTR_SIZE;
+
+	return 1;
+}
+
+static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
+static int kgdb_singlestep(struct pt_regs *regs)
+{
+	struct thread_info *thread_info, *exception_thread_info;
+	struct thread_info *backup_current_thread_info =
+		this_cpu_ptr(&kgdb_thread_info);
+
+	if (user_mode(regs))
+		return 0;
+
+	/*
+	 * On Book E and perhaps other processors, singlestep is handled on
+	 * the critical exception stack.  This causes current_thread_info()
+	 * to fail, since it it locates the thread_info by masking off
+	 * the low bits of the current stack pointer.  We work around
+	 * this issue by copying the thread_info from the kernel stack
+	 * before calling kgdb_handle_exception, and copying it back
+	 * afterwards.  On most processors the copy is avoided since
+	 * exception_thread_info == thread_info.
+	 */
+	thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
+	exception_thread_info = current_thread_info();
+
+	if (thread_info != exception_thread_info) {
+		/* Save the original current_thread_info. */
+		memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info);
+		memcpy(exception_thread_info, thread_info, sizeof *thread_info);
+	}
+
+	kgdb_handle_exception(0, SIGTRAP, 0, regs);
+
+	if (thread_info != exception_thread_info)
+		/* Restore current_thread_info lastly. */
+		memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
+
+	return 1;
+}
+
+static int kgdb_iabr_match(struct pt_regs *regs)
+{
+	if (user_mode(regs))
+		return 0;
+
+	if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0)
+		return 0;
+	return 1;
+}
+
+static int kgdb_break_match(struct pt_regs *regs)
+{
+	if (user_mode(regs))
+		return 0;
+
+	if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0)
+		return 0;
+	return 1;
+}
+
+#define PACK64(ptr, src) do { *(ptr++) = (src); } while (0)
+
+#define PACK32(ptr, src) do {          \
+	u32 *ptr32;                   \
+	ptr32 = (u32 *)ptr;           \
+	*(ptr32++) = (src);           \
+	ptr = (unsigned long *)ptr32; \
+	} while (0)
+
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+	struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
+						  STACK_FRAME_OVERHEAD);
+	unsigned long *ptr = gdb_regs;
+	int reg;
+
+	memset(gdb_regs, 0, NUMREGBYTES);
+
+	/* Regs GPR0-2 */
+	for (reg = 0; reg < 3; reg++)
+		PACK64(ptr, regs->gpr[reg]);
+
+	/* Regs GPR3-13 are caller saved, not in regs->gpr[] */
+	ptr += 11;
+
+	/* Regs GPR14-31 */
+	for (reg = 14; reg < 32; reg++)
+		PACK64(ptr, regs->gpr[reg]);
+
+#ifdef CONFIG_FSL_BOOKE
+#ifdef CONFIG_SPE
+	for (reg = 0; reg < 32; reg++)
+		PACK64(ptr, p->thread.evr[reg]);
+#else
+	ptr += 32;
+#endif
+#else
+	/* fp registers not used by kernel, leave zero */
+	ptr += 32 * 8 / sizeof(long);
+#endif
+
+	PACK64(ptr, regs->nip);
+	PACK64(ptr, regs->msr);
+	PACK32(ptr, regs->ccr);
+	PACK64(ptr, regs->link);
+	PACK64(ptr, regs->ctr);
+	PACK32(ptr, regs->xer);
+
+	BUG_ON((unsigned long)ptr >
+	       (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
+}
+
+#define GDB_SIZEOF_REG sizeof(unsigned long)
+#define GDB_SIZEOF_REG_U32 sizeof(u32)
+
+#ifdef CONFIG_FSL_BOOKE
+#define GDB_SIZEOF_FLOAT_REG sizeof(unsigned long)
+#else
+#define GDB_SIZEOF_FLOAT_REG sizeof(u64)
+#endif
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
+{
+	{ "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[0]) },
+	{ "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[1]) },
+	{ "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[2]) },
+	{ "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[3]) },
+	{ "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[4]) },
+	{ "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[5]) },
+	{ "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[6]) },
+	{ "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[7]) },
+	{ "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[8]) },
+	{ "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[9]) },
+	{ "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[10]) },
+	{ "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[11]) },
+	{ "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[12]) },
+	{ "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[13]) },
+	{ "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[14]) },
+	{ "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[15]) },
+	{ "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[16]) },
+	{ "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[17]) },
+	{ "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[18]) },
+	{ "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[19]) },
+	{ "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[20]) },
+	{ "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[21]) },
+	{ "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[22]) },
+	{ "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[23]) },
+	{ "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[24]) },
+	{ "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[25]) },
+	{ "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[26]) },
+	{ "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[27]) },
+	{ "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[28]) },
+	{ "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[29]) },
+	{ "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[30]) },
+	{ "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[31]) },
+
+	{ "f0", GDB_SIZEOF_FLOAT_REG, 0 },
+	{ "f1", GDB_SIZEOF_FLOAT_REG, 1 },
+	{ "f2", GDB_SIZEOF_FLOAT_REG, 2 },
+	{ "f3", GDB_SIZEOF_FLOAT_REG, 3 },
+	{ "f4", GDB_SIZEOF_FLOAT_REG, 4 },
+	{ "f5", GDB_SIZEOF_FLOAT_REG, 5 },
+	{ "f6", GDB_SIZEOF_FLOAT_REG, 6 },
+	{ "f7", GDB_SIZEOF_FLOAT_REG, 7 },
+	{ "f8", GDB_SIZEOF_FLOAT_REG, 8 },
+	{ "f9", GDB_SIZEOF_FLOAT_REG, 9 },
+	{ "f10", GDB_SIZEOF_FLOAT_REG, 10 },
+	{ "f11", GDB_SIZEOF_FLOAT_REG, 11 },
+	{ "f12", GDB_SIZEOF_FLOAT_REG, 12 },
+	{ "f13", GDB_SIZEOF_FLOAT_REG, 13 },
+	{ "f14", GDB_SIZEOF_FLOAT_REG, 14 },
+	{ "f15", GDB_SIZEOF_FLOAT_REG, 15 },
+	{ "f16", GDB_SIZEOF_FLOAT_REG, 16 },
+	{ "f17", GDB_SIZEOF_FLOAT_REG, 17 },
+	{ "f18", GDB_SIZEOF_FLOAT_REG, 18 },
+	{ "f19", GDB_SIZEOF_FLOAT_REG, 19 },
+	{ "f20", GDB_SIZEOF_FLOAT_REG, 20 },
+	{ "f21", GDB_SIZEOF_FLOAT_REG, 21 },
+	{ "f22", GDB_SIZEOF_FLOAT_REG, 22 },
+	{ "f23", GDB_SIZEOF_FLOAT_REG, 23 },
+	{ "f24", GDB_SIZEOF_FLOAT_REG, 24 },
+	{ "f25", GDB_SIZEOF_FLOAT_REG, 25 },
+	{ "f26", GDB_SIZEOF_FLOAT_REG, 26 },
+	{ "f27", GDB_SIZEOF_FLOAT_REG, 27 },
+	{ "f28", GDB_SIZEOF_FLOAT_REG, 28 },
+	{ "f29", GDB_SIZEOF_FLOAT_REG, 29 },
+	{ "f30", GDB_SIZEOF_FLOAT_REG, 30 },
+	{ "f31", GDB_SIZEOF_FLOAT_REG, 31 },
+
+	{ "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, nip) },
+	{ "msr", GDB_SIZEOF_REG, offsetof(struct pt_regs, msr) },
+	{ "cr", GDB_SIZEOF_REG_U32, offsetof(struct pt_regs, ccr) },
+	{ "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, link) },
+	{ "ctr", GDB_SIZEOF_REG_U32, offsetof(struct pt_regs, ctr) },
+	{ "xer", GDB_SIZEOF_REG, offsetof(struct pt_regs, xer) },
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (regno < 32 || regno >= 64)
+		/* First 0 -> 31 gpr registers*/
+		/* pc, msr, ls... registers 64 -> 69 */
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+				dbg_reg_def[regno].size);
+
+	if (regno >= 32 && regno < 64) {
+		/* FP registers 32 -> 63 */
+#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE)
+		if (current)
+			memcpy(mem, &current->thread.evr[regno-32],
+					dbg_reg_def[regno].size);
+#else
+		/* fp registers not used by kernel, leave zero */
+		memset(mem, 0, dbg_reg_def[regno].size);
+#endif
+	}
+
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return -EINVAL;
+
+	if (regno < 32 || regno >= 64)
+		/* First 0 -> 31 gpr registers*/
+		/* pc, msr, ls... registers 64 -> 69 */
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+				dbg_reg_def[regno].size);
+
+	if (regno >= 32 && regno < 64) {
+		/* FP registers 32 -> 63 */
+#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE)
+		memcpy(&current->thread.evr[regno-32], mem,
+				dbg_reg_def[regno].size);
+#else
+		/* fp registers not used by kernel, leave zero */
+		return 0;
+#endif
+	}
+
+	return 0;
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->nip = pc;
+}
+
+/*
+ * This function does PowerPC specific procesing for interfacing to gdb.
+ */
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+			       char *remcom_in_buffer, char *remcom_out_buffer,
+			       struct pt_regs *linux_regs)
+{
+	char *ptr = &remcom_in_buffer[1];
+	unsigned long addr;
+
+	switch (remcom_in_buffer[0]) {
+		/*
+		 * sAA..AA   Step one instruction from AA..AA
+		 * This will return an error to gdb ..
+		 */
+	case 's':
+	case 'c':
+		/* handle the optional parameter */
+		if (kgdb_hex2long(&ptr, &addr))
+			linux_regs->nip = addr;
+
+		atomic_set(&kgdb_cpu_doing_single_step, -1);
+		/* set the trace bit if we're stepping */
+		if (remcom_in_buffer[0] == 's') {
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+			mtspr(SPRN_DBCR0,
+			      mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
+			linux_regs->msr |= MSR_DE;
+#else
+			linux_regs->msr |= MSR_SE;
+#endif
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
+		}
+		return 0;
+	}
+
+	return -1;
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned int instr;
+	unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+	err = probe_kernel_address(addr, instr);
+	if (err)
+		return err;
+
+	err = patch_instruction(addr, BREAK_INSTR);
+	if (err)
+		return -EFAULT;
+
+	*(unsigned int *)bpt->saved_instr = instr;
+
+	return 0;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned int instr = *(unsigned int *)bpt->saved_instr;
+	unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+	err = patch_instruction(addr, instr);
+	if (err)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Global data
+ */
+struct kgdb_arch arch_kgdb_ops;
+
+static int kgdb_not_implemented(struct pt_regs *regs)
+{
+	return 0;
+}
+
+static void *old__debugger_ipi;
+static void *old__debugger;
+static void *old__debugger_bpt;
+static void *old__debugger_sstep;
+static void *old__debugger_iabr_match;
+static void *old__debugger_break_match;
+static void *old__debugger_fault_handler;
+
+int kgdb_arch_init(void)
+{
+	old__debugger_ipi = __debugger_ipi;
+	old__debugger = __debugger;
+	old__debugger_bpt = __debugger_bpt;
+	old__debugger_sstep = __debugger_sstep;
+	old__debugger_iabr_match = __debugger_iabr_match;
+	old__debugger_break_match = __debugger_break_match;
+	old__debugger_fault_handler = __debugger_fault_handler;
+
+	__debugger_ipi = kgdb_call_nmi_hook;
+	__debugger = kgdb_debugger;
+	__debugger_bpt = kgdb_handle_breakpoint;
+	__debugger_sstep = kgdb_singlestep;
+	__debugger_iabr_match = kgdb_iabr_match;
+	__debugger_break_match = kgdb_break_match;
+	__debugger_fault_handler = kgdb_not_implemented;
+
+	return 0;
+}
+
+void kgdb_arch_exit(void)
+{
+	__debugger_ipi = old__debugger_ipi;
+	__debugger = old__debugger;
+	__debugger_bpt = old__debugger_bpt;
+	__debugger_sstep = old__debugger_sstep;
+	__debugger_iabr_match = old__debugger_iabr_match;
+	__debugger_break_match = old__debugger_break_match;
+	__debugger_fault_handler = old__debugger_fault_handler;
+}
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
new file mode 100644
index 000000000..e4a49c051
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -0,0 +1,76 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+ *		  IBM Corporation
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+/* Ftrace callback handler for kprobes */
+void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
+			   struct ftrace_ops *ops, struct pt_regs *regs)
+{
+	struct kprobe *p;
+	struct kprobe_ctlblk *kcb;
+
+	p = get_kprobe((kprobe_opcode_t *)nip);
+	if (unlikely(!p) || kprobe_disabled(p))
+		return;
+
+	kcb = get_kprobe_ctlblk();
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(p);
+	} else {
+		/*
+		 * On powerpc, NIP is *before* this instruction for the
+		 * pre handler
+		 */
+		regs->nip -= MCOUNT_INSN_SIZE;
+
+		__this_cpu_write(current_kprobe, p);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		if (!p->pre_handler || !p->pre_handler(p, regs)) {
+			/*
+			 * Emulate singlestep (and also recover regs->nip)
+			 * as if there is a nop
+			 */
+			regs->nip += MCOUNT_INSN_SIZE;
+			if (unlikely(p->post_handler)) {
+				kcb->kprobe_status = KPROBE_HIT_SSDONE;
+				p->post_handler(p, regs, 0);
+			}
+		}
+		/*
+		 * If pre_handler returns !0, it changes regs->nip. We have to
+		 * skip emulating post_handler.
+		 */
+		__this_cpu_write(current_kprobe, NULL);
+	}
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+	p->ainsn.insn = NULL;
+	p->ainsn.boostable = -1;
+	return 0;
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
new file mode 100644
index 000000000..ccf16bccc
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes.c
@@ -0,0 +1,629 @@
+/*
+ *  Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *		Probes initial implementation ( includes contributions from
+ *		Rusty Russell).
+ * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *		interface to access function arguments.
+ * 2004-Nov	Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port
+ *		for PPC64
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/slab.h>
+#include <asm/code-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/sstep.h>
+#include <asm/sections.h>
+#include <linux/uaccess.h>
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+	return  (addr >= (unsigned long)__kprobes_text_start &&
+		 addr < (unsigned long)__kprobes_text_end) ||
+		(addr >= (unsigned long)_stext &&
+		 addr < (unsigned long)__head_end);
+}
+
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
+{
+	kprobe_opcode_t *addr = NULL;
+
+#ifdef PPC64_ELF_ABI_v2
+	/* PPC64 ABIv2 needs local entry point */
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	if (addr && !offset) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+		unsigned long faddr;
+		/*
+		 * Per livepatch.h, ftrace location is always within the first
+		 * 16 bytes of a function on powerpc with -mprofile-kernel.
+		 */
+		faddr = ftrace_location_range((unsigned long)addr,
+					      (unsigned long)addr + 16);
+		if (faddr)
+			addr = (kprobe_opcode_t *)faddr;
+		else
+#endif
+			addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+	}
+#elif defined(PPC64_ELF_ABI_v1)
+	/*
+	 * 64bit powerpc ABIv1 uses function descriptors:
+	 * - Check for the dot variant of the symbol first.
+	 * - If that fails, try looking up the symbol provided.
+	 *
+	 * This ensures we always get to the actual symbol and not
+	 * the descriptor.
+	 *
+	 * Also handle <module:symbol> format.
+	 */
+	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+	bool dot_appended = false;
+	const char *c;
+	ssize_t ret = 0;
+	int len = 0;
+
+	if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+		c++;
+		len = c - name;
+		memcpy(dot_name, name, len);
+	} else
+		c = name;
+
+	if (*c != '\0' && *c != '.') {
+		dot_name[len++] = '.';
+		dot_appended = true;
+	}
+	ret = strscpy(dot_name + len, c, KSYM_NAME_LEN);
+	if (ret > 0)
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+
+	/* Fallback to the original non-dot symbol lookup */
+	if (!addr && dot_appended)
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#else
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#endif
+
+	return addr;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+	int ret = 0;
+	kprobe_opcode_t insn = *p->addr;
+
+	if ((unsigned long)p->addr & 0x03) {
+		printk("Attempt to register kprobe at an unaligned address\n");
+		ret = -EINVAL;
+	} else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) {
+		printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n");
+		ret = -EINVAL;
+	}
+
+	/* insn must be on a special executable page on ppc64.  This is
+	 * not explicitly required on ppc32 (right now), but it doesn't hurt */
+	if (!ret) {
+		p->ainsn.insn = get_insn_slot();
+		if (!p->ainsn.insn)
+			ret = -ENOMEM;
+	}
+
+	if (!ret) {
+		memcpy(p->ainsn.insn, p->addr,
+				MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+		p->opcode = *p->addr;
+		flush_icache_range((unsigned long)p->ainsn.insn,
+			(unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t));
+	}
+
+	p->ainsn.boostable = 0;
+	return ret;
+}
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+	patch_instruction(p->addr, BREAKPOINT_INSTRUCTION);
+}
+NOKPROBE_SYMBOL(arch_arm_kprobe);
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+	patch_instruction(p->addr, p->opcode);
+}
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+	if (p->ainsn.insn) {
+		free_insn_slot(p->ainsn.insn, 0);
+		p->ainsn.insn = NULL;
+	}
+}
+NOKPROBE_SYMBOL(arch_remove_kprobe);
+
+static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	enable_single_step(regs);
+
+	/*
+	 * On powerpc we should single step on the original
+	 * instruction even if the probed insn is a trap
+	 * variant as values in regs could play a part in
+	 * if the trap is taken or not
+	 */
+	regs->nip = (unsigned long)p->ainsn.insn;
+}
+
+static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	kcb->prev_kprobe.kp = kprobe_running();
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
+}
+
+static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+	kcb->kprobe_status = kcb->prev_kprobe.status;
+	kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
+}
+
+static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+				struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, p);
+	kcb->kprobe_saved_msr = regs->msr;
+}
+
+bool arch_kprobe_on_func_entry(unsigned long offset)
+{
+#ifdef PPC64_ELF_ABI_v2
+#ifdef CONFIG_KPROBES_ON_FTRACE
+	return offset <= 16;
+#else
+	return offset <= 8;
+#endif
+#else
+	return !offset;
+#endif
+}
+
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	ri->ret_addr = (kprobe_opcode_t *)regs->link;
+
+	/* Replace the return addr with trampoline addr */
+	regs->link = (unsigned long)kretprobe_trampoline;
+}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+
+static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+{
+	int ret;
+	unsigned int insn = *p->ainsn.insn;
+
+	/* regs->nip is also adjusted if emulate_step returns 1 */
+	ret = emulate_step(regs, insn);
+	if (ret > 0) {
+		/*
+		 * Once this instruction has been boosted
+		 * successfully, set the boostable flag
+		 */
+		if (unlikely(p->ainsn.boostable == 0))
+			p->ainsn.boostable = 1;
+	} else if (ret < 0) {
+		/*
+		 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
+		 * So, we should never get here... but, its still
+		 * good to catch them, just in case...
+		 */
+		printk("Can't step on instruction %x\n", insn);
+		BUG();
+	} else {
+		/*
+		 * If we haven't previously emulated this instruction, then it
+		 * can't be boosted. Note it down so we don't try to do so again.
+		 *
+		 * If, however, we had emulated this instruction in the past,
+		 * then this is just an error with the current run (for
+		 * instance, exceptions due to a load/store). We return 0 so
+		 * that this is now single-stepped, but continue to try
+		 * emulating it in subsequent probe hits.
+		 */
+		if (unlikely(p->ainsn.boostable != 1))
+			p->ainsn.boostable = -1;
+	}
+
+	return ret;
+}
+NOKPROBE_SYMBOL(try_to_emulate);
+
+int kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	unsigned int *addr = (unsigned int *)regs->nip;
+	struct kprobe_ctlblk *kcb;
+
+	if (user_mode(regs))
+		return 0;
+
+	if (!IS_ENABLED(CONFIG_BOOKE) &&
+	    (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)))
+		return 0;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
+
+	/* Check we're not actually recursing */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			kprobe_opcode_t insn = *p->ainsn.insn;
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
+					is_trap(insn)) {
+				/* Turn off 'trace' bits */
+				regs->msr &= ~MSR_SINGLESTEP;
+				regs->msr |= kcb->kprobe_saved_msr;
+				goto no_kprobe;
+			}
+			/* We have reentered the kprobe_handler(), since
+			 * another probe was hit while within the handler.
+			 * We here save the original kprobes variables and
+			 * just single step on the instruction of the new probe
+			 * without calling any user handlers.
+			 */
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, regs, kcb);
+			kprobes_inc_nmissed_count(p);
+			kcb->kprobe_status = KPROBE_REENTER;
+			if (p->ainsn.boostable >= 0) {
+				ret = try_to_emulate(p, regs);
+
+				if (ret > 0) {
+					restore_previous_kprobe(kcb);
+					preempt_enable_no_resched();
+					return 1;
+				}
+			}
+			prepare_singlestep(p, regs);
+			return 1;
+		} else if (*addr != BREAKPOINT_INSTRUCTION) {
+			/* If trap variant, then it belongs not to us */
+			kprobe_opcode_t cur_insn = *addr;
+
+			if (is_trap(cur_insn))
+				goto no_kprobe;
+			/* The breakpoint instruction was removed by
+			 * another cpu right after we hit, no further
+			 * handling of this interrupt is appropriate
+			 */
+			ret = 1;
+		}
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p) {
+		if (*addr != BREAKPOINT_INSTRUCTION) {
+			/*
+			 * PowerPC has multiple variants of the "trap"
+			 * instruction. If the current instruction is a
+			 * trap variant, it could belong to someone else
+			 */
+			kprobe_opcode_t cur_insn = *addr;
+			if (is_trap(cur_insn))
+				goto no_kprobe;
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it */
+		goto no_kprobe;
+	}
+
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+	set_current_kprobe(p, regs, kcb);
+	if (p->pre_handler && p->pre_handler(p, regs)) {
+		/* handler changed execution path, so skip ss setup */
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+		return 1;
+	}
+
+	if (p->ainsn.boostable >= 0) {
+		ret = try_to_emulate(p, regs);
+
+		if (ret > 0) {
+			if (p->post_handler)
+				p->post_handler(p, regs, 0);
+
+			kcb->kprobe_status = KPROBE_HIT_SSDONE;
+			reset_current_kprobe();
+			preempt_enable_no_resched();
+			return 1;
+		}
+	}
+	prepare_singlestep(p, regs);
+	kcb->kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+NOKPROBE_SYMBOL(kprobe_handler);
+
+/*
+ * Function return probe trampoline:
+ * 	- init_kprobes() establishes a probepoint here
+ * 	- When the probed function returns, this probe
+ * 		causes the handlers to fire
+ */
+asm(".global kretprobe_trampoline\n"
+	".type kretprobe_trampoline, @function\n"
+	"kretprobe_trampoline:\n"
+	"nop\n"
+	"blr\n"
+	".size kretprobe_trampoline, .-kretprobe_trampoline\n");
+
+/*
+ * Called when the probe at kretprobe trampoline is hit
+ */
+static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri = NULL;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *tmp;
+	unsigned long flags, orig_ret_address = 0;
+	unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+	INIT_HLIST_HEAD(&empty_rp);
+	kretprobe_hash_lock(current, &head, &flags);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because an multiple functions in the call path
+	 * have a return probe installed on them, and/or more than one return
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+	 *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri, &empty_rp);
+
+		if (orig_ret_address != trampoline_address)
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+	}
+
+	kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+	/*
+	 * We get here through one of two paths:
+	 * 1. by taking a trap -> kprobe_handler() -> here
+	 * 2. by optprobe branch -> optimized_callback() -> opt_pre_handler() -> here
+	 *
+	 * When going back through (1), we need regs->nip to be setup properly
+	 * as it is used to determine the return address from the trap.
+	 * For (2), since nip is not honoured with optprobes, we instead setup
+	 * the link register properly so that the subsequent 'blr' in
+	 * kretprobe_trampoline jumps back to the right instruction.
+	 *
+	 * For nip, we should set the address to the previous instruction since
+	 * we end up emulating it in kprobe_handler(), which increments the nip
+	 * again.
+	 */
+	regs->nip = orig_ret_address - 4;
+	regs->link = orig_ret_address;
+
+	kretprobe_hash_unlock(current, &flags);
+
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+
+	return 0;
+}
+NOKPROBE_SYMBOL(trampoline_probe_handler);
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "breakpoint"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ */
+int kprobe_post_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur || user_mode(regs))
+		return 0;
+
+	/* make sure we got here for instruction we have a kprobe on */
+	if (((unsigned long)cur->ainsn.insn + 4) != regs->nip)
+		return 0;
+
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	/* Adjust nip to after the single-stepped instruction */
+	regs->nip = (unsigned long)cur->addr + 4;
+	regs->msr |= kcb->kprobe_saved_msr;
+
+	/*Restore back the original saved kprobes variables and continue. */
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
+		goto out;
+	}
+	reset_current_kprobe();
+out:
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, msr
+	 * will have DE/SE set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->msr & MSR_SINGLESTEP)
+		return 0;
+
+	return 1;
+}
+NOKPROBE_SYMBOL(kprobe_post_handler);
+
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	const struct exception_table_entry *entry;
+
+	switch(kcb->kprobe_status) {
+	case KPROBE_HIT_SS:
+	case KPROBE_REENTER:
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault. We reset the current
+		 * kprobe and the nip points back to the probe address
+		 * and allow the page fault handler to continue as a
+		 * normal page fault.
+		 */
+		regs->nip = (unsigned long)cur->addr;
+		regs->msr &= ~MSR_SINGLESTEP; /* Turn off 'trace' bits */
+		regs->msr |= kcb->kprobe_saved_msr;
+		if (kcb->kprobe_status == KPROBE_REENTER)
+			restore_previous_kprobe(kcb);
+		else
+			reset_current_kprobe();
+		preempt_enable_no_resched();
+		break;
+	case KPROBE_HIT_ACTIVE:
+	case KPROBE_HIT_SSDONE:
+		/*
+		 * We increment the nmissed count for accounting,
+		 * we can also use npre/npostfault count for accounting
+		 * these specific fault cases.
+		 */
+		kprobes_inc_nmissed_count(cur);
+
+		/*
+		 * We come here because instructions in the pre/post
+		 * handler caused the page_fault, this could happen
+		 * if handler tries to access user space by
+		 * copy_from_user(), get_user() etc. Let the
+		 * user-specified handler try to fix it first.
+		 */
+		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+			return 1;
+
+		/*
+		 * In case the user-specified fault handler returned
+		 * zero, try to fix up.
+		 */
+		if ((entry = search_exception_tables(regs->nip)) != NULL) {
+			regs->nip = extable_fixup(entry);
+			return 1;
+		}
+
+		/*
+		 * fixup_exception() could not handle it,
+		 * Let do_page_fault() fix it.
+		 */
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+unsigned long arch_deref_entry_point(void *entry)
+{
+#ifdef PPC64_ELF_ABI_v1
+	if (!kernel_text_address((unsigned long)entry))
+		return ppc_global_function_entry(entry);
+	else
+#endif
+		return (unsigned long)entry;
+}
+NOKPROBE_SYMBOL(arch_deref_entry_point);
+
+static struct kprobe trampoline_p = {
+	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler
+};
+
+int __init arch_init_kprobes(void)
+{
+	return register_kprobe(&trampoline_p);
+}
+
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
+		return 1;
+
+	return 0;
+}
+NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
new file mode 100644
index 000000000..2283b9bfd
--- /dev/null
+++ b/arch/powerpc/kernel/kvm.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/kmemleak.h>
+#include <linux/kvm_para.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/pagemap.h>
+
+#include <asm/reg.h>
+#include <asm/sections.h>
+#include <asm/cacheflush.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
+
+#define KVM_MAGIC_PAGE		(-4096L)
+#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
+
+#define KVM_INST_LWZ		0x80000000
+#define KVM_INST_STW		0x90000000
+#define KVM_INST_LD		0xe8000000
+#define KVM_INST_STD		0xf8000000
+#define KVM_INST_NOP		0x60000000
+#define KVM_INST_B		0x48000000
+#define KVM_INST_B_MASK		0x03ffffff
+#define KVM_INST_B_MAX		0x01ffffff
+#define KVM_INST_LI		0x38000000
+
+#define KVM_MASK_RT		0x03e00000
+#define KVM_RT_30		0x03c00000
+#define KVM_MASK_RB		0x0000f800
+#define KVM_INST_MFMSR		0x7c0000a6
+
+#define SPR_FROM		0
+#define SPR_TO			0x100
+
+#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
+				    (((sprn) & 0x1f) << 16) | \
+				    (((sprn) & 0x3e0) << 6) | \
+				    (moveto))
+
+#define KVM_INST_MFSPR(sprn)	KVM_INST_SPR(sprn, SPR_FROM)
+#define KVM_INST_MTSPR(sprn)	KVM_INST_SPR(sprn, SPR_TO)
+
+#define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L0	0x7c000164
+#define KVM_INST_MTMSRD_L1	0x7c010164
+#define KVM_INST_MTMSR		0x7c000124
+
+#define KVM_INST_WRTEE		0x7c000106
+#define KVM_INST_WRTEEI_0	0x7c000146
+#define KVM_INST_WRTEEI_1	0x7c008146
+
+#define KVM_INST_MTSRIN		0x7c0001e4
+
+static bool kvm_patching_worked = true;
+char kvm_tmp[1024 * 1024];
+static int kvm_tmp_index;
+
+static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
+{
+	*inst = new_inst;
+	flush_icache_range((ulong)inst, (ulong)inst + 4);
+}
+
+static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
+}
+
+static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
+}
+
+static void kvm_patch_ins_nop(u32 *inst)
+{
+	kvm_patch_ins(inst, KVM_INST_NOP);
+}
+
+static void kvm_patch_ins_b(u32 *inst, int addr)
+{
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S)
+	/* On relocatable kernels interrupts handlers and our code
+	   can be in different regions, so we don't patch them */
+
+	if ((ulong)inst < (ulong)&__end_interrupts)
+		return;
+#endif
+
+	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
+}
+
+static u32 *kvm_alloc(int len)
+{
+	u32 *p;
+
+	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
+				kvm_tmp_index, len);
+		kvm_patching_worked = false;
+		return NULL;
+	}
+
+	p = (void*)&kvm_tmp[kvm_tmp_index];
+	kvm_tmp_index += len;
+
+	return p;
+}
+
+extern u32 kvm_emulate_mtmsrd_branch_offs;
+extern u32 kvm_emulate_mtmsrd_reg_offs;
+extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
+extern u32 kvm_emulate_mtmsrd_len;
+extern u32 kvm_emulate_mtmsrd[];
+
+static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsrd_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
+	p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+		break;
+	}
+
+	p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_mtmsr_branch_offs;
+extern u32 kvm_emulate_mtmsr_reg1_offs;
+extern u32 kvm_emulate_mtmsr_reg2_offs;
+extern u32 kvm_emulate_mtmsr_orig_ins_offs;
+extern u32 kvm_emulate_mtmsr_len;
+extern u32 kvm_emulate_mtmsr[];
+
+static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsr_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
+	p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+	/* Make clobbered registers work too */
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsr_reg1_offs] |= rt;
+		p[kvm_emulate_mtmsr_reg2_offs] |= rt;
+		break;
+	}
+
+	p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#ifdef CONFIG_BOOKE
+
+extern u32 kvm_emulate_wrtee_branch_offs;
+extern u32 kvm_emulate_wrtee_reg_offs;
+extern u32 kvm_emulate_wrtee_orig_ins_offs;
+extern u32 kvm_emulate_wrtee_len;
+extern u32 kvm_emulate_wrtee[];
+
+static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrtee_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4);
+	p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+	if (imm_one) {
+		p[kvm_emulate_wrtee_reg_offs] =
+			KVM_INST_LI | __PPC_RT(R30) | MSR_EE;
+	} else {
+		/* Make clobbered registers work too */
+		switch (get_rt(rt)) {
+		case 30:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch2), KVM_RT_30);
+			break;
+		case 31:
+			kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+					 magic_var(scratch1), KVM_RT_30);
+			break;
+		default:
+			p[kvm_emulate_wrtee_reg_offs] |= rt;
+			break;
+		}
+	}
+
+	p[kvm_emulate_wrtee_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_wrteei_0_branch_offs;
+extern u32 kvm_emulate_wrteei_0_len;
+extern u32 kvm_emulate_wrteei_0[];
+
+static void kvm_patch_ins_wrteei_0(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_0_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4);
+	p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+
+extern u32 kvm_emulate_mtsrin_branch_offs;
+extern u32 kvm_emulate_mtsrin_reg1_offs;
+extern u32 kvm_emulate_mtsrin_reg2_offs;
+extern u32 kvm_emulate_mtsrin_orig_ins_offs;
+extern u32 kvm_emulate_mtsrin_len;
+extern u32 kvm_emulate_mtsrin[];
+
+static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtsrin_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4);
+	p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10);
+	p[kvm_emulate_mtsrin_reg2_offs] |= rt;
+	p[kvm_emulate_mtsrin_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
+static void kvm_map_magic_page(void *data)
+{
+	u32 *features = data;
+
+	ulong in[8] = {0};
+	ulong out[8];
+
+	in[0] = KVM_MAGIC_PAGE;
+	in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX;
+
+	epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
+
+	*features = out[0];
+}
+
+static void kvm_check_ins(u32 *inst, u32 features)
+{
+	u32 _inst = *inst;
+	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
+	u32 inst_rt = _inst & KVM_MASK_RT;
+
+	switch (inst_no_rt) {
+	/* Loads */
+	case KVM_INST_MFMSR:
+		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG0):
+		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG1):
+		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG2):
+		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG3):
+		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SRR0):
+		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SRR1):
+		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
+		break;
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MFSPR(SPRN_DAR):
+#endif
+		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_DSISR):
+		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
+		break;
+
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MFSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MFSPR(SPRN_SPRG4):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG4R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG5):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG5R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG6):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG6R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MFSPR(SPRN_SPRG7):
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_SPRG7R):
+#endif
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MFSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt);
+		break;
+#endif
+
+	case KVM_INST_MFSPR(SPRN_PIR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt);
+		break;
+
+
+	/* Stores */
+	case KVM_INST_MTSPR(SPRN_SPRG0):
+		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG1):
+		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG2):
+		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG3):
+		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SRR0):
+		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SRR1):
+		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
+		break;
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_DEAR):
+#else
+	case KVM_INST_MTSPR(SPRN_DAR):
+#endif
+		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_DSISR):
+		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
+		break;
+#ifdef CONFIG_PPC_BOOK3E_MMU
+	case KVM_INST_MTSPR(SPRN_MAS0):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS1):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS2):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(mas2), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS3):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_MAS7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt);
+		break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+	case KVM_INST_MTSPR(SPRN_SPRG4):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG5):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG6):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt);
+		break;
+	case KVM_INST_MTSPR(SPRN_SPRG7):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt);
+		break;
+
+#ifdef CONFIG_BOOKE
+	case KVM_INST_MTSPR(SPRN_ESR):
+		if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+			kvm_patch_ins_stw(inst, magic_var(esr), inst_rt);
+		break;
+#endif
+
+	/* Nops */
+	case KVM_INST_TLBSYNC:
+		kvm_patch_ins_nop(inst);
+		break;
+
+	/* Rewrites */
+	case KVM_INST_MTMSRD_L1:
+		kvm_patch_ins_mtmsrd(inst, inst_rt);
+		break;
+	case KVM_INST_MTMSR:
+	case KVM_INST_MTMSRD_L0:
+		kvm_patch_ins_mtmsr(inst, inst_rt);
+		break;
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEE:
+		kvm_patch_ins_wrtee(inst, inst_rt, 0);
+		break;
+#endif
+	}
+
+	switch (inst_no_rt & ~KVM_MASK_RB) {
+#ifdef CONFIG_PPC_BOOK3S_32
+	case KVM_INST_MTSRIN:
+		if (features & KVM_MAGIC_FEAT_SR) {
+			u32 inst_rb = _inst & KVM_MASK_RB;
+			kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
+		}
+		break;
+#endif
+	}
+
+	switch (_inst) {
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEEI_0:
+		kvm_patch_ins_wrteei_0(inst);
+		break;
+
+	case KVM_INST_WRTEEI_1:
+		kvm_patch_ins_wrtee(inst, 0, 1);
+		break;
+#endif
+	}
+}
+
+extern u32 kvm_template_start[];
+extern u32 kvm_template_end[];
+
+static void kvm_use_magic_page(void)
+{
+	u32 *p;
+	u32 *start, *end;
+	u32 features;
+
+	/* Tell the host to map the magic page to -4096 on all CPUs */
+	on_each_cpu(kvm_map_magic_page, &features, 1);
+
+	/* Quick self-test to see if the mapping works */
+	if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Now loop through all code and find instructions */
+	start = (void*)_stext;
+	end = (void*)_etext;
+
+	/*
+	 * Being interrupted in the middle of patching would
+	 * be bad for SPRG4-7, which KVM can't keep in sync
+	 * with emulated accesses because reads don't trap.
+	 */
+	local_irq_disable();
+
+	for (p = start; p < end; p++) {
+		/* Avoid patching the template code */
+		if (p >= kvm_template_start && p < kvm_template_end) {
+			p = kvm_template_end - 1;
+			continue;
+		}
+		kvm_check_ins(p, features);
+	}
+
+	local_irq_enable();
+
+	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
+			 kvm_patching_worked ? "worked" : "failed");
+}
+
+static __init void kvm_free_tmp(void)
+{
+	/*
+	 * Inform kmemleak about the hole in the .bss section since the
+	 * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y.
+	 */
+	kmemleak_free_part(&kvm_tmp[kvm_tmp_index],
+			   ARRAY_SIZE(kvm_tmp) - kvm_tmp_index);
+	free_reserved_area(&kvm_tmp[kvm_tmp_index],
+			   &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
+}
+
+static int __init kvm_guest_init(void)
+{
+	if (!kvm_para_available())
+		goto free_tmp;
+
+	if (!epapr_paravirt_enabled)
+		goto free_tmp;
+
+	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
+		kvm_use_magic_page();
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Enable napping */
+	powersave_nap = 1;
+#endif
+
+free_tmp:
+	kvm_free_tmp();
+
+	return 0;
+}
+
+postcore_initcall(kvm_guest_init);
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
new file mode 100644
index 000000000..c005088f6
--- /dev/null
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -0,0 +1,349 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#define KVM_MAGIC_PAGE		(-4096)
+
+#ifdef CONFIG_64BIT
+#define LL64(reg, offs, reg2)	ld	reg, (offs)(reg2)
+#define STL64(reg, offs, reg2)	std	reg, (offs)(reg2)
+#else
+#define LL64(reg, offs, reg2)	lwz	reg, (offs + 4)(reg2)
+#define STL64(reg, offs, reg2)	stw	reg, (offs + 4)(reg2)
+#endif
+
+#define SCRATCH_SAVE							\
+	/* Enable critical section. We are critical if			\
+	   shared->critical == r1 */					\
+	STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);		\
+									\
+	/* Save state */						\
+	PPC_STL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	PPC_STL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+	mfcr	r31;							\
+	stw	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);
+
+#define SCRATCH_RESTORE							\
+	/* Restore state */						\
+	PPC_LL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);		\
+	mtcr	r30;							\
+	PPC_LL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+									\
+	/* Disable critical section. We are critical if			\
+	   shared->critical == r1 and r2 is always != r1 */		\
+	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
+
+.global kvm_template_start
+kvm_template_start:
+
+.global kvm_emulate_mtmsrd
+kvm_emulate_mtmsrd:
+
+	SCRATCH_SAVE
+
+	/* Put MSR & ~(MSR_EE|MSR_RI) in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	lis	r30, (~(MSR_EE | MSR_RI))@h
+	ori	r30, r30, (~(MSR_EE | MSR_RI))@l
+	and	r31, r31, r30
+
+	/* OR the register's (MSR_EE|MSR_RI) on MSR */
+kvm_emulate_mtmsrd_reg:
+	ori	r30, r0, 0
+	andi.	r30, r30, (MSR_EE|MSR_RI)
+	or	r31, r31, r30
+
+	/* Put MSR back into magic page */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_check
+
+	/* Check if we may trigger an interrupt */
+	andi.	r30, r30, MSR_EE
+	beq	no_check
+
+	SCRATCH_RESTORE
+
+	/* Nag hypervisor */
+kvm_emulate_mtmsrd_orig_ins:
+	tlbsync
+
+	b	kvm_emulate_mtmsrd_branch
+
+no_check:
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsrd_branch:
+	b	.
+kvm_emulate_mtmsrd_end:
+
+.global kvm_emulate_mtmsrd_branch_offs
+kvm_emulate_mtmsrd_branch_offs:
+	.long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_reg_offs
+kvm_emulate_mtmsrd_reg_offs:
+	.long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_orig_ins_offs
+kvm_emulate_mtmsrd_orig_ins_offs:
+	.long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_len
+kvm_emulate_mtmsrd_len:
+	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
+
+
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
+#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
+
+.global kvm_emulate_mtmsr
+kvm_emulate_mtmsr:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Find the changed bits between old and new MSR */
+kvm_emulate_mtmsr_reg1:
+	ori	r30, r0, 0
+	xor	r31, r30, r31
+
+	/* Check if we need to really do mtmsr */
+	LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
+	and.	r31, r31, r30
+
+	/* No critical bits changed? Maybe we can stay in the guest. */
+	beq	maybe_stay_in_guest
+
+do_mtmsr:
+
+	SCRATCH_RESTORE
+
+	/* Just fire off the mtmsr if it's critical */
+kvm_emulate_mtmsr_orig_ins:
+	mtmsr	r0
+
+	b	kvm_emulate_mtmsr_branch
+
+maybe_stay_in_guest:
+
+	/* Get the target register in r30 */
+kvm_emulate_mtmsr_reg2:
+	ori	r30, r0, 0
+
+	/* Put MSR into magic page because we don't call mtmsr */
+	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_mtmsr
+
+	/* Check if we may trigger an interrupt */
+	andi.	r31, r30, MSR_EE
+	bne	do_mtmsr
+
+no_mtmsr:
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsr_branch:
+	b	.
+kvm_emulate_mtmsr_end:
+
+.global kvm_emulate_mtmsr_branch_offs
+kvm_emulate_mtmsr_branch_offs:
+	.long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg1_offs
+kvm_emulate_mtmsr_reg1_offs:
+	.long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg2_offs
+kvm_emulate_mtmsr_reg2_offs:
+	.long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_orig_ins_offs
+kvm_emulate_mtmsr_orig_ins_offs:
+	.long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_len
+kvm_emulate_mtmsr_len:
+	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
+
+/* also used for wrteei 1 */
+.global kvm_emulate_wrtee
+kvm_emulate_wrtee:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Insert new MSR[EE] */
+kvm_emulate_wrtee_reg:
+	ori	r30, r0, 0
+	rlwimi	r31, r30, 0, MSR_EE
+
+	/*
+	 * If MSR[EE] is now set, check for a pending interrupt.
+	 * We could skip this if MSR[EE] was already on, but that
+	 * should be rare, so don't bother.
+	 */
+	andi.	r30, r30, MSR_EE
+
+	/* Put MSR into magic page because we don't call wrtee */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	beq	no_wrtee
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r30, 0
+	bne	do_wrtee
+
+no_wrtee:
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrtee_branch:
+	b	.
+
+do_wrtee:
+	SCRATCH_RESTORE
+
+	/* Just fire off the wrtee if it's critical */
+kvm_emulate_wrtee_orig_ins:
+	wrtee	r0
+
+	b	kvm_emulate_wrtee_branch
+
+kvm_emulate_wrtee_end:
+
+.global kvm_emulate_wrtee_branch_offs
+kvm_emulate_wrtee_branch_offs:
+	.long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_reg_offs
+kvm_emulate_wrtee_reg_offs:
+	.long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_orig_ins_offs
+kvm_emulate_wrtee_orig_ins_offs:
+	.long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_len
+kvm_emulate_wrtee_len:
+	.long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrteei_0
+kvm_emulate_wrteei_0:
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Remove MSR_EE from old MSR */
+	rlwinm	r31, r31, 0, ~MSR_EE
+
+	/* Write new MSR value back */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrteei_0_branch:
+	b	.
+kvm_emulate_wrteei_0_end:
+
+.global kvm_emulate_wrteei_0_branch_offs
+kvm_emulate_wrteei_0_branch_offs:
+	.long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4
+
+.global kvm_emulate_wrteei_0_len
+kvm_emulate_wrteei_0_len:
+	.long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
+
+.global kvm_emulate_mtsrin
+kvm_emulate_mtsrin:
+
+	SCRATCH_SAVE
+
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	andi.	r31, r31, MSR_DR | MSR_IR
+	beq	kvm_emulate_mtsrin_reg1
+
+	SCRATCH_RESTORE
+
+kvm_emulate_mtsrin_orig_ins:
+	nop
+	b	kvm_emulate_mtsrin_branch
+
+kvm_emulate_mtsrin_reg1:
+	/* rX >> 26 */
+	rlwinm  r30,r0,6,26,29
+
+kvm_emulate_mtsrin_reg2:
+	stw	r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtsrin_branch:
+	b	.
+kvm_emulate_mtsrin_end:
+
+.global kvm_emulate_mtsrin_branch_offs
+kvm_emulate_mtsrin_branch_offs:
+	.long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg1_offs
+kvm_emulate_mtsrin_reg1_offs:
+	.long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg2_offs
+kvm_emulate_mtsrin_reg2_offs:
+	.long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_orig_ins_offs
+kvm_emulate_mtsrin_orig_ins_offs:
+	.long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_len
+kvm_emulate_mtsrin_len:
+	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
+
+.global kvm_template_end
+kvm_template_end:
diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S
new file mode 100644
index 000000000..9d4b42d11
--- /dev/null
+++ b/arch/powerpc/kernel/l2cr_6xx.S
@@ -0,0 +1,471 @@
+/*
+	L2CR functions
+	Copyright © 1997-1998 by PowerLogix R & D, Inc.
+
+	This program is free software; you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation; either version 2 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to the Free Software
+	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+/*
+	Thur, Dec. 12, 1998.
+	- First public release, contributed by PowerLogix.
+	***********
+	Sat, Aug. 7, 1999.
+	- Terry: Made sure code disabled interrupts before running. (Previously
+			it was assumed interrupts were already disabled).
+	- Terry: Updated for tentative G4 support.  4MB of memory is now flushed
+			instead of 2MB.  (Prob. only 3 is necessary).
+	- Terry: Updated for workaround to HID0[DPM] processor bug
+			during global invalidates.
+	***********
+	Thu, July 13, 2000.
+	- Terry: Added isync to correct for an errata.
+
+	22 August 2001.
+	- DanM: Finally added the 7450 patch I've had for the past
+		several months.  The L2CR is similar, but I'm going
+		to assume the user of this functions knows what they
+		are doing.
+
+	Author:	Terry Greeniaus (tgree@phys.ualberta.ca)
+	Please e-mail updates to this file to me, thanks!
+*/
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/cache.h>
+#include <asm/page.h>
+#include <asm/feature-fixups.h>
+
+/* Usage:
+
+	When setting the L2CR register, you must do a few special
+	things.  If you are enabling the cache, you must perform a
+	global invalidate.  If you are disabling the cache, you must
+	flush the cache contents first.  This routine takes care of
+	doing these things.  When first enabling the cache, make sure
+	you pass in the L2CR you want, as well as passing in the
+	global invalidate bit set.  A global invalidate will only be
+	performed if the L2I bit is set in applyThis.  When enabling
+	the cache, you should also set the L2E bit in applyThis.  If
+	you want to modify the L2CR contents after the cache has been
+	enabled, the recommended procedure is to first call
+	__setL2CR(0) to disable the cache and then call it again with
+	the new values for L2CR.  Examples:
+
+	_setL2CR(0)		- disables the cache
+	_setL2CR(0xB3A04000)	- enables my G3 upgrade card:
+				- L2E set to turn on the cache
+				- L2SIZ set to 1MB
+				- L2CLK set to 1:1
+				- L2RAM set to pipelined synchronous late-write
+				- L2I set to perform a global invalidation
+				- L2OH set to 0.5 nS
+				- L2DF set because this upgrade card
+				  requires it
+
+	A similar call should work for your card.  You need to know
+	the correct setting for your card and then place them in the
+	fields I have outlined above.  Other fields support optional
+	features, such as L2DO which caches only data, or L2TS which
+	causes cache pushes from the L1 cache to go to the L2 cache
+	instead of to main memory.
+
+IMPORTANT:
+	Starting with the 7450, the bits in this register have moved
+	or behave differently.  The Enable, Parity Enable, Size,
+	and L2 Invalidate are the only bits that have not moved.
+	The size is read-only for these processors with internal L2
+	cache, and the invalidate is a control as well as status.
+		-- Dan
+
+*/
+/*
+ * Summary: this procedure ignores the L2I bit in the value passed in,
+ * flushes the cache if it was already enabled, always invalidates the
+ * cache, then enables the cache if the L2E bit is set in the value
+ * passed in.
+ *   -- paulus.
+ */
+_GLOBAL(_set_L2CR)
+	/* Make sure this is a 750 or 7400 chip */
+BEGIN_FTR_SECTION
+	li	r3,-1
+	blr
+END_FTR_SECTION_IFCLR(CPU_FTR_L2CR)
+
+	mflr	r9
+
+	/* Stop DST streams */
+BEGIN_FTR_SECTION
+	PPC_DSSALL
+	sync
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+
+	/* Turn off interrupts and data relocation. */
+	mfmsr	r7		/* Save MSR in r7 */
+	rlwinm	r4,r7,0,17,15
+	rlwinm	r4,r4,0,28,26	/* Turn off DR bit */
+	sync
+	mtmsr	r4
+	isync
+
+	/* Before we perform the global invalidation, we must disable dynamic
+	 * power management via HID0[DPM] to work around a processor bug where
+	 * DPM can possibly interfere with the state machine in the processor
+	 * that invalidates the L2 cache tags.
+	 */
+	mfspr	r8,SPRN_HID0		/* Save HID0 in r8 */
+	rlwinm	r4,r8,0,12,10		/* Turn off HID0[DPM] */
+	sync
+	mtspr	SPRN_HID0,r4		/* Disable DPM */
+	sync
+
+	/* Get the current enable bit of the L2CR into r4 */
+	mfspr	r4,SPRN_L2CR
+
+	/* Tweak some bits */
+	rlwinm	r5,r3,0,0,0		/* r5 contains the new enable bit */
+	rlwinm	r3,r3,0,11,9		/* Turn off the invalidate bit */
+	rlwinm	r3,r3,0,1,31		/* Turn off the enable bit */
+
+	/* Check to see if we need to flush */
+	rlwinm.	r4,r4,0,0,0
+	beq	2f
+
+	/* Flush the cache. First, read the first 4MB of memory (physical) to
+	 * put new data in the cache.  (Actually we only need
+	 * the size of the L2 cache plus the size of the L1 cache, but 4MB will
+	 * cover everything just to be safe).
+	 */
+
+	 /**** Might be a good idea to set L2DO here - to prevent instructions
+	       from getting into the cache.  But since we invalidate
+	       the next time we enable the cache it doesn't really matter.
+	       Don't do this unless you accommodate all processor variations.
+	       The bit moved on the 7450.....
+	  ****/
+
+BEGIN_FTR_SECTION
+	/* Disable L2 prefetch on some 745x and try to ensure
+	 * L2 prefetch engines are idle. As explained by errata
+	 * text, we can't be sure they are, we just hope very hard
+	 * that well be enough (sic !). At least I noticed Apple
+	 * doesn't even bother doing the dcbf's here...
+	 */
+	mfspr	r4,SPRN_MSSCR0
+	rlwinm	r4,r4,0,0,29
+	sync
+	mtspr	SPRN_MSSCR0,r4
+	sync
+	isync
+	lis	r4,KERNELBASE@h
+	dcbf	0,r4
+	dcbf	0,r4
+	dcbf	0,r4
+	dcbf	0,r4
+END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)
+
+	/* TODO: use HW flush assist when available */
+
+	lis	r4,0x0002
+	mtctr	r4
+	li	r4,0
+1:
+	lwzx	r0,0,r4
+	addi	r4,r4,32		/* Go to start of next cache line */
+	bdnz	1b
+	isync
+
+	/* Now, flush the first 4MB of memory */
+	lis	r4,0x0002
+	mtctr	r4
+	li	r4,0
+	sync
+1:
+	dcbf	0,r4
+	addi	r4,r4,32		/* Go to start of next cache line */
+	bdnz	1b
+
+2:
+	/* Set up the L2CR configuration bits (and switch L2 off) */
+	/* CPU errata: Make sure the mtspr below is already in the
+	 * L1 icache
+	 */
+	b	20f
+	.balign	L1_CACHE_BYTES
+22:
+	sync
+	mtspr	SPRN_L2CR,r3
+	sync
+	b	23f
+20:
+	b	21f
+21:	sync
+	isync
+	b	22b
+
+23:
+	/* Perform a global invalidation */
+	oris	r3,r3,0x0020
+	sync
+	mtspr	SPRN_L2CR,r3
+	sync
+	isync				/* For errata */
+
+BEGIN_FTR_SECTION
+	/* On the 7450, we wait for the L2I bit to clear......
+	*/
+10:	mfspr	r3,SPRN_L2CR
+	andis.	r4,r3,0x0020
+	bne	10b
+	b	11f
+END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)
+
+	/* Wait for the invalidation to complete */
+3:	mfspr	r3,SPRN_L2CR
+	rlwinm.	r4,r3,0,31,31
+	bne	3b
+
+11:	rlwinm	r3,r3,0,11,9		/* Turn off the L2I bit */
+	sync
+	mtspr	SPRN_L2CR,r3
+	sync
+
+	/* See if we need to enable the cache */
+	cmplwi	r5,0
+	beq	4f
+
+	/* Enable the cache */
+	oris	r3,r3,0x8000
+	mtspr	SPRN_L2CR,r3
+	sync
+	
+	/* Enable L2 HW prefetch on 744x/745x */
+BEGIN_FTR_SECTION
+	mfspr	r3,SPRN_MSSCR0
+	ori	r3,r3,3
+	sync
+	mtspr	SPRN_MSSCR0,r3
+	sync
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)
+4:
+
+	/* Restore HID0[DPM] to whatever it was before */
+	sync
+	mtspr	1008,r8
+	sync
+
+	/* Restore MSR (restores EE and DR bits to original state) */
+	SYNC
+	mtmsr	r7
+	isync
+
+	mtlr	r9
+	blr
+
+_GLOBAL(_get_L2CR)
+	/* Return the L2CR contents */
+	li	r3,0
+BEGIN_FTR_SECTION
+	mfspr	r3,SPRN_L2CR
+END_FTR_SECTION_IFSET(CPU_FTR_L2CR)
+	blr
+
+
+/*
+ * Here is a similar routine for dealing with the L3 cache
+ * on the 745x family of chips
+ */
+
+_GLOBAL(_set_L3CR)
+	/* Make sure this is a 745x chip */
+BEGIN_FTR_SECTION
+	li	r3,-1
+	blr
+END_FTR_SECTION_IFCLR(CPU_FTR_L3CR)
+
+	/* Turn off interrupts and data relocation. */
+	mfmsr	r7		/* Save MSR in r7 */
+	rlwinm	r4,r7,0,17,15
+	rlwinm	r4,r4,0,28,26	/* Turn off DR bit */
+	sync
+	mtmsr	r4
+	isync
+
+	/* Stop DST streams */
+	PPC_DSSALL
+	sync
+
+	/* Get the current enable bit of the L3CR into r4 */
+	mfspr	r4,SPRN_L3CR
+
+	/* Tweak some bits */
+	rlwinm	r5,r3,0,0,0		/* r5 contains the new enable bit */
+	rlwinm	r3,r3,0,22,20		/* Turn off the invalidate bit */
+	rlwinm	r3,r3,0,2,31		/* Turn off the enable & PE bits */
+	rlwinm	r3,r3,0,5,3		/* Turn off the clken bit */
+	/* Check to see if we need to flush */
+	rlwinm.	r4,r4,0,0,0
+	beq	2f
+
+	/* Flush the cache.
+	 */
+
+	/* TODO: use HW flush assist */
+
+	lis	r4,0x0008
+	mtctr	r4
+	li	r4,0
+1:
+	lwzx	r0,0,r4
+	dcbf	0,r4
+	addi	r4,r4,32		/* Go to start of next cache line */
+	bdnz	1b
+
+2:
+	/* Set up the L3CR configuration bits (and switch L3 off) */
+	sync
+	mtspr	SPRN_L3CR,r3
+	sync
+
+	oris	r3,r3,L3CR_L3RES@h		/* Set reserved bit 5 */
+	mtspr	SPRN_L3CR,r3
+	sync
+	oris	r3,r3,L3CR_L3CLKEN@h		/* Set clken */
+	mtspr	SPRN_L3CR,r3
+	sync
+
+	/* Wait for stabilize */
+	li	r0,256
+	mtctr	r0
+1:	bdnz	1b
+
+	/* Perform a global invalidation */
+	ori	r3,r3,0x0400
+	sync
+	mtspr	SPRN_L3CR,r3
+	sync
+	isync
+
+	/* We wait for the L3I bit to clear...... */
+10:	mfspr	r3,SPRN_L3CR
+	andi.	r4,r3,0x0400
+	bne	10b
+
+	/* Clear CLKEN */
+	rlwinm	r3,r3,0,5,3		/* Turn off the clken bit */
+	mtspr	SPRN_L3CR,r3
+	sync
+
+	/* Wait for stabilize */
+	li	r0,256
+	mtctr	r0
+1:	bdnz	1b
+
+	/* See if we need to enable the cache */
+	cmplwi	r5,0
+	beq	4f
+
+	/* Enable the cache */
+	oris	r3,r3,(L3CR_L3E | L3CR_L3CLKEN)@h
+	mtspr	SPRN_L3CR,r3
+	sync
+
+	/* Wait for stabilize */
+	li	r0,256
+	mtctr	r0
+1:	bdnz	1b
+
+	/* Restore MSR (restores EE and DR bits to original state) */
+4:	SYNC
+	mtmsr	r7
+	isync
+	blr
+
+_GLOBAL(_get_L3CR)
+	/* Return the L3CR contents */
+	li	r3,0
+BEGIN_FTR_SECTION
+	mfspr	r3,SPRN_L3CR
+END_FTR_SECTION_IFSET(CPU_FTR_L3CR)
+	blr
+
+/* --- End of PowerLogix code ---
+ */
+
+
+/* flush_disable_L1()	- Flush and disable L1 cache
+ *
+ * clobbers r0, r3, ctr, cr0
+ * Must be called with interrupts disabled and MMU enabled.
+ */
+_GLOBAL(__flush_disable_L1)
+	/* Stop pending alitvec streams and memory accesses */
+BEGIN_FTR_SECTION
+	PPC_DSSALL
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+ 	sync
+
+	/* Load counter to 0x4000 cache lines (512k) and
+	 * load cache with datas
+	 */
+	li	r3,0x4000	/* 512kB / 32B */
+	mtctr	r3
+	lis	r3,KERNELBASE@h
+1:
+	lwz	r0,0(r3)
+	addi	r3,r3,0x0020	/* Go to start of next cache line */
+	bdnz	1b
+	isync
+	sync
+
+	/* Now flush those cache lines */
+	li	r3,0x4000	/* 512kB / 32B */
+	mtctr	r3
+	lis	r3,KERNELBASE@h
+1:
+	dcbf	0,r3
+	addi	r3,r3,0x0020	/* Go to start of next cache line */
+	bdnz	1b
+	sync
+
+	/* We can now disable the L1 cache (HID0:DCE, HID0:ICE) */
+	mfspr	r3,SPRN_HID0
+	rlwinm	r3,r3,0,18,15
+	mtspr	SPRN_HID0,r3
+	sync
+	isync
+ 	blr
+
+/* inval_enable_L1	- Invalidate and enable L1 cache
+ *
+ * Assumes L1 is already disabled and MSR:EE is off
+ *
+ * clobbers r3
+ */
+_GLOBAL(__inval_enable_L1)
+	/* Enable and then Flash inval the instruction & data cache */
+	mfspr	r3,SPRN_HID0
+	ori	r3,r3, HID0_ICE|HID0_ICFI|HID0_DCE|HID0_DCI
+	sync
+	isync
+	mtspr	SPRN_HID0,r3
+	xori	r3,r3, HID0_ICFI|HID0_DCI
+	mtspr	SPRN_HID0,r3
+	sync
+
+ 	blr
+
+
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
new file mode 100644
index 000000000..5b9dce17f
--- /dev/null
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/serial.h>
+#include <linux/serial_8250.h>
+#include <linux/serial_core.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/serial_reg.h>
+#include <asm/io.h>
+#include <asm/mmu.h>
+#include <asm/prom.h>
+#include <asm/serial.h>
+#include <asm/udbg.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(fmt...) do { printk(fmt); } while(0)
+#else
+#define DBG(fmt...) do { } while(0)
+#endif
+
+#define MAX_LEGACY_SERIAL_PORTS	8
+
+static struct plat_serial8250_port
+legacy_serial_ports[MAX_LEGACY_SERIAL_PORTS+1];
+static struct legacy_serial_info {
+	struct device_node		*np;
+	unsigned int			speed;
+	unsigned int			clock;
+	int				irq_check_parent;
+	phys_addr_t			taddr;
+} legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS];
+
+static const struct of_device_id legacy_serial_parents[] __initconst = {
+	{.type = "soc",},
+	{.type = "tsi-bridge",},
+	{.type = "opb", },
+	{.compatible = "ibm,opb",},
+	{.compatible = "simple-bus",},
+	{.compatible = "wrs,epld-localbus",},
+	{},
+};
+
+static unsigned int legacy_serial_count;
+static int legacy_serial_console = -1;
+
+static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
+	UPF_SHARE_IRQ | UPF_FIXED_PORT;
+
+static unsigned int tsi_serial_in(struct uart_port *p, int offset)
+{
+	unsigned int tmp;
+	offset = offset << p->regshift;
+	if (offset == UART_IIR) {
+		tmp = readl(p->membase + (UART_IIR & ~3));
+		return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */
+	} else
+		return readb(p->membase + offset);
+}
+
+static void tsi_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = offset << p->regshift;
+	if (!((offset == UART_IER) && (value & UART_IER_UUE)))
+		writeb(value, p->membase + offset);
+}
+
+static int __init add_legacy_port(struct device_node *np, int want_index,
+				  int iotype, phys_addr_t base,
+				  phys_addr_t taddr, unsigned long irq,
+				  upf_t flags, int irq_check_parent)
+{
+	const __be32 *clk, *spd, *rs;
+	u32 clock = BASE_BAUD * 16;
+	u32 shift = 0;
+	int index;
+
+	/* get clock freq. if present */
+	clk = of_get_property(np, "clock-frequency", NULL);
+	if (clk && *clk)
+		clock = be32_to_cpup(clk);
+
+	/* get default speed if present */
+	spd = of_get_property(np, "current-speed", NULL);
+
+	/* get register shift if present */
+	rs = of_get_property(np, "reg-shift", NULL);
+	if (rs && *rs)
+		shift = be32_to_cpup(rs);
+
+	/* If we have a location index, then try to use it */
+	if (want_index >= 0 && want_index < MAX_LEGACY_SERIAL_PORTS)
+		index = want_index;
+	else
+		index = legacy_serial_count;
+
+	/* if our index is still out of range, that mean that
+	 * array is full, we could scan for a free slot but that
+	 * make little sense to bother, just skip the port
+	 */
+	if (index >= MAX_LEGACY_SERIAL_PORTS)
+		return -1;
+	if (index >= legacy_serial_count)
+		legacy_serial_count = index + 1;
+
+	/* Check if there is a port who already claimed our slot */
+	if (legacy_serial_infos[index].np != NULL) {
+		/* if we still have some room, move it, else override */
+		if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) {
+			printk(KERN_DEBUG "Moved legacy port %d -> %d\n",
+			       index, legacy_serial_count);
+			legacy_serial_ports[legacy_serial_count] =
+				legacy_serial_ports[index];
+			legacy_serial_infos[legacy_serial_count] =
+				legacy_serial_infos[index];
+			legacy_serial_count++;
+		} else {
+			printk(KERN_DEBUG "Replacing legacy port %d\n", index);
+		}
+	}
+
+	/* Now fill the entry */
+	memset(&legacy_serial_ports[index], 0,
+	       sizeof(struct plat_serial8250_port));
+	if (iotype == UPIO_PORT)
+		legacy_serial_ports[index].iobase = base;
+	else
+		legacy_serial_ports[index].mapbase = base;
+
+	legacy_serial_ports[index].iotype = iotype;
+	legacy_serial_ports[index].uartclk = clock;
+	legacy_serial_ports[index].irq = irq;
+	legacy_serial_ports[index].flags = flags;
+	legacy_serial_ports[index].regshift = shift;
+	legacy_serial_infos[index].taddr = taddr;
+	legacy_serial_infos[index].np = of_node_get(np);
+	legacy_serial_infos[index].clock = clock;
+	legacy_serial_infos[index].speed = spd ? be32_to_cpup(spd) : 0;
+	legacy_serial_infos[index].irq_check_parent = irq_check_parent;
+
+	if (iotype == UPIO_TSI) {
+		legacy_serial_ports[index].serial_in = tsi_serial_in;
+		legacy_serial_ports[index].serial_out = tsi_serial_out;
+	}
+
+	printk(KERN_DEBUG "Found legacy serial port %d for %pOF\n",
+	       index, np);
+	printk(KERN_DEBUG "  %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n",
+	       (iotype == UPIO_PORT) ? "port" : "mem",
+	       (unsigned long long)base, (unsigned long long)taddr, irq,
+	       legacy_serial_ports[index].uartclk,
+	       legacy_serial_infos[index].speed);
+
+	return index;
+}
+
+static int __init add_legacy_soc_port(struct device_node *np,
+				      struct device_node *soc_dev)
+{
+	u64 addr;
+	const __be32 *addrp;
+	struct device_node *tsi = of_get_parent(np);
+
+	/* We only support ports that have a clock frequency properly
+	 * encoded in the device-tree.
+	 */
+	if (of_get_property(np, "clock-frequency", NULL) == NULL)
+		return -1;
+
+	/* if reg-offset don't try to use it */
+	if ((of_get_property(np, "reg-offset", NULL) != NULL))
+		return -1;
+
+	/* if rtas uses this device, don't try to use it as well */
+	if (of_get_property(np, "used-by-rtas", NULL) != NULL)
+		return -1;
+
+	/* Get the address */
+	addrp = of_get_address(soc_dev, 0, NULL, NULL);
+	if (addrp == NULL)
+		return -1;
+
+	addr = of_translate_address(soc_dev, addrp);
+	if (addr == OF_BAD_ADDR)
+		return -1;
+
+	/* Add port, irq will be dealt with later. We passed a translated
+	 * IO port value. It will be fixed up later along with the irq
+	 */
+	if (tsi && !strcmp(tsi->type, "tsi-bridge"))
+		return add_legacy_port(np, -1, UPIO_TSI, addr, addr,
+				       0, legacy_port_flags, 0);
+	else
+		return add_legacy_port(np, -1, UPIO_MEM, addr, addr,
+				       0, legacy_port_flags, 0);
+}
+
+static int __init add_legacy_isa_port(struct device_node *np,
+				      struct device_node *isa_brg)
+{
+	const __be32 *reg;
+	const char *typep;
+	int index = -1;
+	u64 taddr;
+
+	DBG(" -> add_legacy_isa_port(%pOF)\n", np);
+
+	/* Get the ISA port number */
+	reg = of_get_property(np, "reg", NULL);
+	if (reg == NULL)
+		return -1;
+
+	/* Verify it's an IO port, we don't support anything else */
+	if (!(be32_to_cpu(reg[0]) & 0x00000001))
+		return -1;
+
+	/* Now look for an "ibm,aix-loc" property that gives us ordering
+	 * if any...
+	 */
+	typep = of_get_property(np, "ibm,aix-loc", NULL);
+
+	/* If we have a location index, then use it */
+	if (typep && *typep == 'S')
+		index = simple_strtol(typep+1, NULL, 0) - 1;
+
+	/* Translate ISA address. If it fails, we still register the port
+	 * with no translated address so that it can be picked up as an IO
+	 * port later by the serial driver
+	 *
+	 * Note: Don't even try on P8 lpc, we know it's not directly mapped
+	 */
+	if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc") ||
+	    of_get_property(isa_brg, "ranges", NULL)) {
+		taddr = of_translate_address(np, reg);
+		if (taddr == OF_BAD_ADDR)
+			taddr = 0;
+	} else
+		taddr = 0;
+
+	/* Add port, irq will be dealt with later */
+	return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]),
+			       taddr, 0, legacy_port_flags, 0);
+
+}
+
+#ifdef CONFIG_PCI
+static int __init add_legacy_pci_port(struct device_node *np,
+				      struct device_node *pci_dev)
+{
+	u64 addr, base;
+	const __be32 *addrp;
+	unsigned int flags;
+	int iotype, index = -1, lindex = 0;
+
+	DBG(" -> add_legacy_pci_port(%pOF)\n", np);
+
+	/* We only support ports that have a clock frequency properly
+	 * encoded in the device-tree (that is have an fcode). Anything
+	 * else can't be used that early and will be normally probed by
+	 * the generic 8250_pci driver later on. The reason is that 8250
+	 * compatible UARTs on PCI need all sort of quirks (port offsets
+	 * etc...) that this code doesn't know about
+	 */
+	if (of_get_property(np, "clock-frequency", NULL) == NULL)
+		return -1;
+
+	/* Get the PCI address. Assume BAR 0 */
+	addrp = of_get_pci_address(pci_dev, 0, NULL, &flags);
+	if (addrp == NULL)
+		return -1;
+
+	/* We only support BAR 0 for now */
+	iotype = (flags & IORESOURCE_MEM) ? UPIO_MEM : UPIO_PORT;
+	addr = of_translate_address(pci_dev, addrp);
+	if (addr == OF_BAD_ADDR)
+		return -1;
+
+	/* Set the IO base to the same as the translated address for MMIO,
+	 * or to the domain local IO base for PIO (it will be fixed up later)
+	 */
+	if (iotype == UPIO_MEM)
+		base = addr;
+	else
+		base = of_read_number(&addrp[2], 1);
+
+	/* Try to guess an index... If we have subdevices of the pci dev,
+	 * we get to their "reg" property
+	 */
+	if (np != pci_dev) {
+		const __be32 *reg = of_get_property(np, "reg", NULL);
+		if (reg && (be32_to_cpup(reg) < 4))
+			index = lindex = be32_to_cpup(reg);
+	}
+
+	/* Local index means it's the Nth port in the PCI chip. Unfortunately
+	 * the offset to add here is device specific. We know about those
+	 * EXAR ports and we default to the most common case. If your UART
+	 * doesn't work for these settings, you'll have to add your own special
+	 * cases here
+	 */
+	if (of_device_is_compatible(pci_dev, "pci13a8,152") ||
+	    of_device_is_compatible(pci_dev, "pci13a8,154") ||
+	    of_device_is_compatible(pci_dev, "pci13a8,158")) {
+		addr += 0x200 * lindex;
+		base += 0x200 * lindex;
+	} else {
+		addr += 8 * lindex;
+		base += 8 * lindex;
+	}
+
+	/* Add port, irq will be dealt with later. We passed a translated
+	 * IO port value. It will be fixed up later along with the irq
+	 */
+	return add_legacy_port(np, index, iotype, base, addr, 0,
+			       legacy_port_flags, np != pci_dev);
+}
+#endif
+
+static void __init setup_legacy_serial_console(int console)
+{
+	struct legacy_serial_info *info = &legacy_serial_infos[console];
+	struct plat_serial8250_port *port = &legacy_serial_ports[console];
+	void __iomem *addr;
+	unsigned int stride;
+
+	stride = 1 << port->regshift;
+
+	/* Check if a translated MMIO address has been found */
+	if (info->taddr) {
+		addr = ioremap(info->taddr, 0x1000);
+		if (addr == NULL)
+			return;
+		udbg_uart_init_mmio(addr, stride);
+	} else {
+		/* Check if it's PIO and we support untranslated PIO */
+		if (port->iotype == UPIO_PORT && isa_io_special)
+			udbg_uart_init_pio(port->iobase, stride);
+		else
+			return;
+	}
+
+	/* Try to query the current speed */
+	if (info->speed == 0)
+		info->speed = udbg_probe_uart_speed(info->clock);
+
+	/* Set it up */
+	DBG("default console speed = %d\n", info->speed);
+	udbg_uart_setup(info->speed, info->clock);
+}
+
+/*
+ * This is called very early, as part of setup_system() or eventually
+ * setup_arch(), basically before anything else in this file. This function
+ * will try to build a list of all the available 8250-compatible serial ports
+ * in the machine using the Open Firmware device-tree. It currently only deals
+ * with ISA and PCI busses but could be extended. It allows a very early boot
+ * console to be initialized, that list is also used later to provide 8250 with
+ * the machine non-PCI ports and to properly pick the default console port
+ */
+void __init find_legacy_serial_ports(void)
+{
+	struct device_node *np, *stdout = NULL;
+	const char *path;
+	int index;
+
+	DBG(" -> find_legacy_serial_port()\n");
+
+	/* Now find out if one of these is out firmware console */
+	path = of_get_property(of_chosen, "linux,stdout-path", NULL);
+	if (path == NULL)
+		path = of_get_property(of_chosen, "stdout-path", NULL);
+	if (path != NULL) {
+		stdout = of_find_node_by_path(path);
+		if (stdout)
+			DBG("stdout is %pOF\n", stdout);
+	} else {
+		DBG(" no linux,stdout-path !\n");
+	}
+
+	/* Iterate over all the 16550 ports, looking for known parents */
+	for_each_compatible_node(np, "serial", "ns16550") {
+		struct device_node *parent = of_get_parent(np);
+		if (!parent)
+			continue;
+		if (of_match_node(legacy_serial_parents, parent) != NULL) {
+			if (of_device_is_available(np)) {
+				index = add_legacy_soc_port(np, np);
+				if (index >= 0 && np == stdout)
+					legacy_serial_console = index;
+			}
+		}
+		of_node_put(parent);
+	}
+
+	/* Next, fill our array with ISA ports */
+	for_each_node_by_type(np, "serial") {
+		struct device_node *isa = of_get_parent(np);
+		if (isa && (!strcmp(isa->name, "isa") ||
+			    !strcmp(isa->name, "lpc"))) {
+			if (of_device_is_available(np)) {
+				index = add_legacy_isa_port(np, isa);
+				if (index >= 0 && np == stdout)
+					legacy_serial_console = index;
+			}
+		}
+		of_node_put(isa);
+	}
+
+#ifdef CONFIG_PCI
+	/* Next, try to locate PCI ports */
+	for (np = NULL; (np = of_find_all_nodes(np));) {
+		struct device_node *pci, *parent = of_get_parent(np);
+		if (parent && !strcmp(parent->name, "isa")) {
+			of_node_put(parent);
+			continue;
+		}
+		if (strcmp(np->name, "serial") && strcmp(np->type, "serial")) {
+			of_node_put(parent);
+			continue;
+		}
+		/* Check for known pciclass, and also check whether we have
+		 * a device with child nodes for ports or not
+		 */
+		if (of_device_is_compatible(np, "pciclass,0700") ||
+		    of_device_is_compatible(np, "pciclass,070002"))
+			pci = np;
+		else if (of_device_is_compatible(parent, "pciclass,0700") ||
+			 of_device_is_compatible(parent, "pciclass,070002"))
+			pci = parent;
+		else {
+			of_node_put(parent);
+			continue;
+		}
+		index = add_legacy_pci_port(np, pci);
+		if (index >= 0 && np == stdout)
+			legacy_serial_console = index;
+		of_node_put(parent);
+	}
+#endif
+
+	DBG("legacy_serial_console = %d\n", legacy_serial_console);
+	if (legacy_serial_console >= 0)
+		setup_legacy_serial_console(legacy_serial_console);
+	DBG(" <- find_legacy_serial_port()\n");
+}
+
+static struct platform_device serial_device = {
+	.name	= "serial8250",
+	.id	= PLAT8250_DEV_PLATFORM,
+	.dev	= {
+		.platform_data = legacy_serial_ports,
+	},
+};
+
+static void __init fixup_port_irq(int index,
+				  struct device_node *np,
+				  struct plat_serial8250_port *port)
+{
+	unsigned int virq;
+
+	DBG("fixup_port_irq(%d)\n", index);
+
+	virq = irq_of_parse_and_map(np, 0);
+	if (!virq && legacy_serial_infos[index].irq_check_parent) {
+		np = of_get_parent(np);
+		if (np == NULL)
+			return;
+		virq = irq_of_parse_and_map(np, 0);
+		of_node_put(np);
+	}
+	if (!virq)
+		return;
+
+	port->irq = virq;
+
+#ifdef CONFIG_SERIAL_8250_FSL
+	if (of_device_is_compatible(np, "fsl,ns16550"))
+		port->handle_irq = fsl8250_handle_irq;
+#endif
+}
+
+static void __init fixup_port_pio(int index,
+				  struct device_node *np,
+				  struct plat_serial8250_port *port)
+{
+#ifdef CONFIG_PCI
+	struct pci_controller *hose;
+
+	DBG("fixup_port_pio(%d)\n", index);
+
+	hose = pci_find_hose_for_OF_device(np);
+	if (hose) {
+		unsigned long offset = (unsigned long)hose->io_base_virt -
+#ifdef CONFIG_PPC64
+			pci_io_base;
+#else
+			isa_io_base;
+#endif
+		DBG("port %d, IO %lx -> %lx\n",
+		    index, port->iobase, port->iobase + offset);
+		port->iobase += offset;
+	}
+#endif
+}
+
+static void __init fixup_port_mmio(int index,
+				   struct device_node *np,
+				   struct plat_serial8250_port *port)
+{
+	DBG("fixup_port_mmio(%d)\n", index);
+
+	port->membase = ioremap(port->mapbase, 0x100);
+}
+
+/*
+ * This is called as an arch initcall, hopefully before the PCI bus is
+ * probed and/or the 8250 driver loaded since we need to register our
+ * platform devices before 8250 PCI ones are detected as some of them
+ * must properly "override" the platform ones.
+ *
+ * This function fixes up the interrupt value for platform ports as it
+ * couldn't be done earlier before interrupt maps have been parsed. It
+ * also "corrects" the IO address for PIO ports for the same reason,
+ * since earlier, the PHBs virtual IO space wasn't assigned yet. It then
+ * registers all those platform ports for use by the 8250 driver when it
+ * finally loads.
+ */
+static int __init serial_dev_init(void)
+{
+	int i;
+
+	if (legacy_serial_count == 0)
+		return -ENODEV;
+
+	/*
+	 * Before we register the platform serial devices, we need
+	 * to fixup their interrupts and their IO ports.
+	 */
+	DBG("Fixing serial ports interrupts and IO ports ...\n");
+
+	for (i = 0; i < legacy_serial_count; i++) {
+		struct plat_serial8250_port *port = &legacy_serial_ports[i];
+		struct device_node *np = legacy_serial_infos[i].np;
+
+		if (!port->irq)
+			fixup_port_irq(i, np, port);
+		if (port->iotype == UPIO_PORT)
+			fixup_port_pio(i, np, port);
+		if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI))
+			fixup_port_mmio(i, np, port);
+	}
+
+	DBG("Registering platform serial ports\n");
+
+	return platform_device_register(&serial_device);
+}
+device_initcall(serial_dev_init);
+
+
+#ifdef CONFIG_SERIAL_8250_CONSOLE
+/*
+ * This is called very early, as part of console_init() (typically just after
+ * time_init()). This function is respondible for trying to find a good
+ * default console on serial ports. It tries to match the open firmware
+ * default output with one of the available serial console drivers that have
+ * been probed earlier by find_legacy_serial_ports()
+ */
+static int __init check_legacy_serial_console(void)
+{
+	struct device_node *prom_stdout = NULL;
+	int i, speed = 0, offset = 0;
+	const char *name;
+	const __be32 *spd;
+
+	DBG(" -> check_legacy_serial_console()\n");
+
+	/* The user has requested a console so this is already set up. */
+	if (strstr(boot_command_line, "console=")) {
+		DBG(" console was specified !\n");
+		return -EBUSY;
+	}
+
+	if (!of_chosen) {
+		DBG(" of_chosen is NULL !\n");
+		return -ENODEV;
+	}
+
+	if (legacy_serial_console < 0) {
+		DBG(" legacy_serial_console not found !\n");
+		return -ENODEV;
+	}
+	/* We are getting a weird phandle from OF ... */
+	/* ... So use the full path instead */
+	name = of_get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL)
+		name = of_get_property(of_chosen, "stdout-path", NULL);
+	if (name == NULL) {
+		DBG(" no stdout-path !\n");
+		return -ENODEV;
+	}
+	prom_stdout = of_find_node_by_path(name);
+	if (!prom_stdout) {
+		DBG(" can't find stdout package %s !\n", name);
+		return -ENODEV;
+	}
+	DBG("stdout is %pOF\n", prom_stdout);
+
+	name = of_get_property(prom_stdout, "name", NULL);
+	if (!name) {
+		DBG(" stdout package has no name !\n");
+		goto not_found;
+	}
+	spd = of_get_property(prom_stdout, "current-speed", NULL);
+	if (spd)
+		speed = be32_to_cpup(spd);
+
+	if (strcmp(name, "serial") != 0)
+		goto not_found;
+
+	/* Look for it in probed array */
+	for (i = 0; i < legacy_serial_count; i++) {
+		if (prom_stdout != legacy_serial_infos[i].np)
+			continue;
+		offset = i;
+		speed = legacy_serial_infos[i].speed;
+		break;
+	}
+	if (i >= legacy_serial_count)
+		goto not_found;
+
+	of_node_put(prom_stdout);
+
+	DBG("Found serial console at ttyS%d\n", offset);
+
+	if (speed) {
+		static char __initdata opt[16];
+		sprintf(opt, "%d", speed);
+		return add_preferred_console("ttyS", offset, opt);
+	} else
+		return add_preferred_console("ttyS", offset, NULL);
+
+ not_found:
+	DBG("No preferred console found !\n");
+	of_node_put(prom_stdout);
+	return -ENODEV;
+}
+console_initcall(check_legacy_serial_console);
+
+#endif /* CONFIG_SERIAL_8250_CONSOLE */
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
new file mode 100644
index 000000000..437c50bfe
--- /dev/null
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -0,0 +1,290 @@
+/*
+ * Code to handle transition of Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/kexec.h>
+#include <linux/reboot.h>
+#include <linux/threads.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/irq.h>
+#include <linux/ftrace.h>
+
+#include <asm/kdump.h>
+#include <asm/machdep.h>
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/sections.h>
+
+void machine_kexec_mask_interrupts(void) {
+	unsigned int i;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(i, desc) {
+		struct irq_chip *chip;
+
+		chip = irq_desc_get_chip(desc);
+		if (!chip)
+			continue;
+
+		if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+			chip->irq_eoi(&desc->irq_data);
+
+		if (chip->irq_mask)
+			chip->irq_mask(&desc->irq_data);
+
+		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+			chip->irq_disable(&desc->irq_data);
+	}
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	default_machine_crash_shutdown(regs);
+}
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	if (ppc_md.machine_kexec_prepare)
+		return ppc_md.machine_kexec_prepare(image);
+	else
+		return default_machine_kexec_prepare(image);
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(node_data);
+	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+	VMCOREINFO_SYMBOL(vmemmap_list);
+	VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+	VMCOREINFO_SYMBOL(mmu_psize_defs);
+	VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+	VMCOREINFO_OFFSET(vmemmap_backing, list);
+	VMCOREINFO_OFFSET(vmemmap_backing, phys);
+	VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+	VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+	VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+	int save_ftrace_enabled;
+
+	save_ftrace_enabled = __ftrace_enabled_save();
+	this_cpu_disable_ftrace();
+
+	if (ppc_md.machine_kexec)
+		ppc_md.machine_kexec(image);
+	else
+		default_machine_kexec(image);
+
+	this_cpu_enable_ftrace();
+	__ftrace_enabled_restore(save_ftrace_enabled);
+
+	/* Fall back to normal restart if we're still alive. */
+	machine_restart(NULL);
+	for(;;);
+}
+
+void __init reserve_crashkernel(void)
+{
+	unsigned long long crash_size, crash_base, total_mem_sz;
+	int ret;
+
+	total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
+	/* use common parsing */
+	ret = parse_crashkernel(boot_command_line, total_mem_sz,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		crashk_res.start = crash_base;
+		crashk_res.end = crash_base + crash_size - 1;
+	}
+
+	if (crashk_res.end == crashk_res.start) {
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+
+	/* We might have got these values via the command line or the
+	 * device tree, either way sanitise them now. */
+
+	crash_size = resource_size(&crashk_res);
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+	if (crashk_res.start != KDUMP_KERNELBASE)
+		printk("Crash kernel location must be 0x%x\n",
+				KDUMP_KERNELBASE);
+
+	crashk_res.start = KDUMP_KERNELBASE;
+#else
+	if (!crashk_res.start) {
+#ifdef CONFIG_PPC64
+		/*
+		 * On the LPAR platform place the crash kernel to mid of
+		 * RMA size (512MB or more) to ensure the crash kernel
+		 * gets enough space to place itself and some stack to be
+		 * in the first segment. At the same time normal kernel
+		 * also get enough space to allocate memory for essential
+		 * system resource in the first segment. Keep the crash
+		 * kernel starts at 128MB offset on other platforms.
+		 */
+		if (firmware_has_feature(FW_FEATURE_LPAR))
+			crashk_res.start = ppc64_rma_size / 2;
+		else
+			crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+#else
+		crashk_res.start = KDUMP_KERNELBASE;
+#endif
+	}
+
+	crash_base = PAGE_ALIGN(crashk_res.start);
+	if (crash_base != crashk_res.start) {
+		printk("Crash kernel base must be aligned to 0x%lx\n",
+				PAGE_SIZE);
+		crashk_res.start = crash_base;
+	}
+
+#endif
+	crash_size = PAGE_ALIGN(crash_size);
+	crashk_res.end = crashk_res.start + crash_size - 1;
+
+	/* The crash region must not overlap the current kernel */
+	if (overlaps_crashkernel(__pa(_stext), _end - _stext)) {
+		printk(KERN_WARNING
+			"Crash kernel can not overlap current kernel\n");
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+
+	/* Crash kernel trumps memory limit */
+	if (memory_limit && memory_limit <= crashk_res.end) {
+		memory_limit = crashk_res.end + 1;
+		total_mem_sz = memory_limit;
+		printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
+		       memory_limit);
+	}
+
+	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+			"for crashkernel (System RAM: %ldMB)\n",
+			(unsigned long)(crash_size >> 20),
+			(unsigned long)(crashk_res.start >> 20),
+			(unsigned long)(total_mem_sz >> 20));
+
+	if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
+	    memblock_reserve(crashk_res.start, crash_size)) {
+		pr_err("Failed to reserve memory for crashkernel!\n");
+		crashk_res.start = crashk_res.end = 0;
+		return;
+	}
+}
+
+int overlaps_crashkernel(unsigned long start, unsigned long size)
+{
+	return (start + size) > crashk_res.start && start <= crashk_res.end;
+}
+
+/* Values we need to export to the second kernel via the device tree. */
+static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
+static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
+
+static struct property kernel_end_prop = {
+	.name = "linux,kernel-end",
+	.length = sizeof(phys_addr_t),
+	.value = &kernel_end,
+};
+
+static struct property crashk_base_prop = {
+	.name = "linux,crashkernel-base",
+	.length = sizeof(phys_addr_t),
+	.value = &crashk_base
+};
+
+static struct property crashk_size_prop = {
+	.name = "linux,crashkernel-size",
+	.length = sizeof(phys_addr_t),
+	.value = &crashk_size,
+};
+
+static struct property memory_limit_prop = {
+	.name = "linux,memory-limit",
+	.length = sizeof(unsigned long long),
+	.value = &mem_limit,
+};
+
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+
+static void __init export_crashk_values(struct device_node *node)
+{
+	/* There might be existing crash kernel properties, but we can't
+	 * be sure what's in them, so remove them. */
+	of_remove_property(node, of_find_property(node,
+				"linux,crashkernel-base", NULL));
+	of_remove_property(node, of_find_property(node,
+				"linux,crashkernel-size", NULL));
+
+	if (crashk_res.start != 0) {
+		crashk_base = cpu_to_be_ulong(crashk_res.start),
+		of_add_property(node, &crashk_base_prop);
+		crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
+		of_add_property(node, &crashk_size_prop);
+	}
+
+	/*
+	 * memory_limit is required by the kexec-tools to limit the
+	 * crash regions to the actual memory used.
+	 */
+	mem_limit = cpu_to_be_ulong(memory_limit);
+	of_update_property(node, &memory_limit_prop);
+}
+
+static int __init kexec_setup(void)
+{
+	struct device_node *node;
+
+	node = of_find_node_by_path("/chosen");
+	if (!node)
+		return -ENOENT;
+
+	/* remove any stale properties so ours can be found */
+	of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
+
+	/* information needed by userspace when using default_machine_kexec */
+	kernel_end = cpu_to_be_ulong(__pa(_end));
+	of_add_property(node, &kernel_end_prop);
+
+	export_crashk_values(node);
+
+	of_node_put(node);
+	return 0;
+}
+late_initcall(kexec_setup);
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
new file mode 100644
index 000000000..affe5dcce
--- /dev/null
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -0,0 +1,69 @@
+/*
+ * PPC32 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * Copyright (C) 2005 IBM Corporation.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <asm/cacheflush.h>
+#include <asm/hw_irq.h>
+#include <asm/io.h>
+
+typedef void (*relocate_new_kernel_t)(
+				unsigned long indirection_page,
+				unsigned long reboot_code_buffer,
+				unsigned long start_address) __noreturn;
+
+/*
+ * This is a generic machine_kexec function suitable at least for
+ * non-OpenFirmware embedded platforms.
+ * It merely copies the image relocation code to the control page and
+ * jumps to it.
+ * A platform specific function may just call this one.
+ */
+void default_machine_kexec(struct kimage *image)
+{
+	extern const unsigned char relocate_new_kernel[];
+	extern const unsigned int relocate_new_kernel_size;
+	unsigned long page_list;
+	unsigned long reboot_code_buffer, reboot_code_buffer_phys;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+
+	/* mask each interrupt so we are in a more sane state for the
+	 * kexec kernel */
+	machine_kexec_mask_interrupts();
+
+	page_list = image->head;
+
+	/* we need both effective and real address here */
+	reboot_code_buffer =
+			(unsigned long)page_address(image->control_code_page);
+	reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
+
+	/* copy our kernel relocation code to the control code page */
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
+
+	flush_icache_range(reboot_code_buffer,
+				reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
+	printk(KERN_INFO "Bye!\n");
+
+	/* now call it */
+	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+	(*rnk)(page_list, reboot_code_buffer_phys, image->start);
+}
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+	return 0;
+}
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
new file mode 100644
index 000000000..a0f6f4500
--- /dev/null
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -0,0 +1,412 @@
+/*
+ * PPC64 code to handle Linux booting another kernel.
+ *
+ * Copyright (C) 2004-2005, IBM Corp.
+ *
+ * Created by: Milton D Miller II
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+
+#include <linux/kexec.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/init_task.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/machdep.h>
+#include <asm/cacheflush.h>
+#include <asm/firmware.h>
+#include <asm/paca.h>
+#include <asm/mmu.h>
+#include <asm/sections.h>	/* _end */
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/asm-prototypes.h>
+
+int default_machine_kexec_prepare(struct kimage *image)
+{
+	int i;
+	unsigned long begin, end;	/* limits of segment */
+	unsigned long low, high;	/* limits of blocked memory range */
+	struct device_node *node;
+	const unsigned long *basep;
+	const unsigned int *sizep;
+
+	/*
+	 * Since we use the kernel fault handlers and paging code to
+	 * handle the virtual mode, we must make sure no destination
+	 * overlaps kernel static data or bss.
+	 */
+	for (i = 0; i < image->nr_segments; i++)
+		if (image->segment[i].mem < __pa(_end))
+			return -ETXTBSY;
+
+	/* We also should not overwrite the tce tables */
+	for_each_node_by_type(node, "pci") {
+		basep = of_get_property(node, "linux,tce-base", NULL);
+		sizep = of_get_property(node, "linux,tce-size", NULL);
+		if (basep == NULL || sizep == NULL)
+			continue;
+
+		low = *basep;
+		high = low + (*sizep);
+
+		for (i = 0; i < image->nr_segments; i++) {
+			begin = image->segment[i].mem;
+			end = begin + image->segment[i].memsz;
+
+			if ((begin < high) && (end > low))
+				return -ETXTBSY;
+		}
+	}
+
+	return 0;
+}
+
+static void copy_segments(unsigned long ind)
+{
+	unsigned long entry;
+	unsigned long *ptr;
+	void *dest;
+	void *addr;
+
+	/*
+	 * We rely on kexec_load to create a lists that properly
+	 * initializes these pointers before they are used.
+	 * We will still crash if the list is wrong, but at least
+	 * the compiler will be quiet.
+	 */
+	ptr = NULL;
+	dest = NULL;
+
+	for (entry = ind; !(entry & IND_DONE); entry = *ptr++) {
+		addr = __va(entry & PAGE_MASK);
+
+		switch (entry & IND_FLAGS) {
+		case IND_DESTINATION:
+			dest = addr;
+			break;
+		case IND_INDIRECTION:
+			ptr = addr;
+			break;
+		case IND_SOURCE:
+			copy_page(dest, addr);
+			dest += PAGE_SIZE;
+		}
+	}
+}
+
+void kexec_copy_flush(struct kimage *image)
+{
+	long i, nr_segments = image->nr_segments;
+	struct  kexec_segment ranges[KEXEC_SEGMENT_MAX];
+
+	/* save the ranges on the stack to efficiently flush the icache */
+	memcpy(ranges, image->segment, sizeof(ranges));
+
+	/*
+	 * After this call we may not use anything allocated in dynamic
+	 * memory, including *image.
+	 *
+	 * Only globals and the stack are allowed.
+	 */
+	copy_segments(image->head);
+
+	/*
+	 * we need to clear the icache for all dest pages sometime,
+	 * including ones that were in place on the original copy
+	 */
+	for (i = 0; i < nr_segments; i++)
+		flush_icache_range((unsigned long)__va(ranges[i].mem),
+			(unsigned long)__va(ranges[i].mem + ranges[i].memsz));
+}
+
+#ifdef CONFIG_SMP
+
+static int kexec_all_irq_disabled = 0;
+
+static void kexec_smp_down(void *arg)
+{
+	local_irq_disable();
+	hard_irq_disable();
+
+	mb(); /* make sure our irqs are disabled before we say they are */
+	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+	while(kexec_all_irq_disabled == 0)
+		cpu_relax();
+	mb(); /* make sure all irqs are disabled before this */
+	hw_breakpoint_disable();
+	/*
+	 * Now every CPU has IRQs off, we can clear out any pending
+	 * IPIs and be sure that no more will come in after this.
+	 */
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 1);
+
+	kexec_smp_wait();
+	/* NOTREACHED */
+}
+
+static void kexec_prepare_cpus_wait(int wait_state)
+{
+	int my_cpu, i, notified=-1;
+
+	hw_breakpoint_disable();
+	my_cpu = get_cpu();
+	/* Make sure each CPU has at least made it to the state we need.
+	 *
+	 * FIXME: There is a (slim) chance of a problem if not all of the CPUs
+	 * are correctly onlined.  If somehow we start a CPU on boot with RTAS
+	 * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
+	 * time, the boot CPU will timeout.  If it does eventually execute
+	 * stuff, the secondary will start up (paca_ptrs[]->cpu_start was
+	 * written) and get into a peculiar state.
+	 * If the platform supports smp_ops->take_timebase(), the secondary CPU
+	 * will probably be spinning in there.  If not (i.e. pseries), the
+	 * secondary will continue on and try to online itself/idle/etc. If it
+	 * survives that, we need to find these
+	 * possible-but-not-online-but-should-be CPUs and chaperone them into
+	 * kexec_smp_wait().
+	 */
+	for_each_online_cpu(i) {
+		if (i == my_cpu)
+			continue;
+
+		while (paca_ptrs[i]->kexec_state < wait_state) {
+			barrier();
+			if (i != notified) {
+				printk(KERN_INFO "kexec: waiting for cpu %d "
+				       "(physical %d) to enter %i state\n",
+				       i, paca_ptrs[i]->hw_cpu_id, wait_state);
+				notified = i;
+			}
+		}
+	}
+	mb();
+}
+
+/*
+ * We need to make sure each present CPU is online.  The next kernel will scan
+ * the device tree and assume primary threads are online and query secondary
+ * threads via RTAS to online them if required.  If we don't online primary
+ * threads, they will be stuck.  However, we also online secondary threads as we
+ * may be using 'cede offline'.  In this case RTAS doesn't see the secondary
+ * threads as offline -- and again, these CPUs will be stuck.
+ *
+ * So, we online all CPUs that should be running, including secondary threads.
+ */
+static void wake_offline_cpus(void)
+{
+	int cpu = 0;
+
+	for_each_present_cpu(cpu) {
+		if (!cpu_online(cpu)) {
+			printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
+			       cpu);
+			WARN_ON(cpu_up(cpu));
+		}
+	}
+}
+
+static void kexec_prepare_cpus(void)
+{
+	wake_offline_cpus();
+	smp_call_function(kexec_smp_down, NULL, /* wait */0);
+	local_irq_disable();
+	hard_irq_disable();
+
+	mb(); /* make sure IRQs are disabled before we say they are */
+	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
+
+	kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
+	/* we are sure every CPU has IRQs off at this point */
+	kexec_all_irq_disabled = 1;
+
+	/*
+	 * Before removing MMU mappings make sure all CPUs have entered real
+	 * mode:
+	 */
+	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
+
+	/* after we tell the others to go down */
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+
+	put_cpu();
+}
+
+#else /* ! SMP */
+
+static void kexec_prepare_cpus(void)
+{
+	/*
+	 * move the secondarys to us so that we can copy
+	 * the new kernel 0-0x100 safely
+	 *
+	 * do this if kexec in setup.c ?
+	 *
+	 * We need to release the cpus if we are ever going from an
+	 * UP to an SMP kernel.
+	 */
+	smp_release_cpus();
+	if (ppc_md.kexec_cpu_down)
+		ppc_md.kexec_cpu_down(0, 0);
+	local_irq_disable();
+	hard_irq_disable();
+}
+
+#endif /* SMP */
+
+/*
+ * kexec thread structure and stack.
+ *
+ * We need to make sure that this is 16384-byte aligned due to the
+ * way process stacks are handled.  It also must be statically allocated
+ * or allocated as part of the kimage, because everything else may be
+ * overwritten when we copy the kexec image.  We piggyback on the
+ * "init_task" linker section here to statically allocate a stack.
+ *
+ * We could use a smaller stack if we don't care about anything using
+ * current, but that audit has not been performed.
+ */
+static union thread_union kexec_stack __init_task_data =
+	{ };
+
+/*
+ * For similar reasons to the stack above, the kexecing CPU needs to be on a
+ * static PACA; we switch to kexec_paca.
+ */
+struct paca_struct kexec_paca;
+
+/* Our assembly helper, in misc_64.S */
+extern void kexec_sequence(void *newstack, unsigned long start,
+			   void *image, void *control,
+			   void (*clear_all)(void),
+			   bool copy_with_mmu_off) __noreturn;
+
+/* too late to fail here */
+void default_machine_kexec(struct kimage *image)
+{
+	bool copy_with_mmu_off;
+
+	/* prepare control code if any */
+
+	/*
+        * If the kexec boot is the normal one, need to shutdown other cpus
+        * into our wait loop and quiesce interrupts.
+        * Otherwise, in the case of crashed mode (crashing_cpu >= 0),
+        * stopping other CPUs and collecting their pt_regs is done before
+        * using debugger IPI.
+        */
+
+	if (!kdump_in_progress())
+		kexec_prepare_cpus();
+
+	printk("kexec: Starting switchover sequence.\n");
+
+	/* switch to a staticly allocated stack.  Based on irq stack code.
+	 * We setup preempt_count to avoid using VMX in memcpy.
+	 * XXX: the task struct will likely be invalid once we do the copy!
+	 */
+	kexec_stack.thread_info.task = current_thread_info()->task;
+	kexec_stack.thread_info.flags = 0;
+	kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
+	kexec_stack.thread_info.cpu = current_thread_info()->cpu;
+
+	/* We need a static PACA, too; copy this CPU's PACA over and switch to
+	 * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
+	 * non-static data.
+	 */
+	memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
+	kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
+#ifdef CONFIG_PPC_PSERIES
+	kexec_paca.lppaca_ptr = NULL;
+#endif
+	paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
+
+	setup_paca(&kexec_paca);
+
+	/*
+	 * The lppaca should be unregistered at this point so the HV won't
+	 * touch it. In the case of a crash, none of the lppacas are
+	 * unregistered so there is not much we can do about it here.
+	 */
+
+	/*
+	 * On Book3S, the copy must happen with the MMU off if we are either
+	 * using Radix page tables or we are not in an LPAR since we can
+	 * overwrite the page tables while copying.
+	 *
+	 * In an LPAR, we keep the MMU on otherwise we can't access beyond
+	 * the RMA. On BookE there is no real MMU off mode, so we have to
+	 * keep it enabled as well (but then we have bolted TLB entries).
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	copy_with_mmu_off = false;
+#else
+	copy_with_mmu_off = radix_enabled() ||
+		!(firmware_has_feature(FW_FEATURE_LPAR) ||
+		  firmware_has_feature(FW_FEATURE_PS3_LV1));
+#endif
+
+	/* Some things are best done in assembly.  Finding globals with
+	 * a toc is easier in C, so pass in what we can.
+	 */
+	kexec_sequence(&kexec_stack, image->start, image,
+		       page_address(image->control_code_page),
+		       mmu_cleanup_all, copy_with_mmu_off);
+	/* NOTREACHED */
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Values we need to export to the second kernel via the device tree. */
+static unsigned long htab_base;
+static unsigned long htab_size;
+
+static struct property htab_base_prop = {
+	.name = "linux,htab-base",
+	.length = sizeof(unsigned long),
+	.value = &htab_base,
+};
+
+static struct property htab_size_prop = {
+	.name = "linux,htab-size",
+	.length = sizeof(unsigned long),
+	.value = &htab_size,
+};
+
+static int __init export_htab_values(void)
+{
+	struct device_node *node;
+
+	/* On machines with no htab htab_address is NULL */
+	if (!htab_address)
+		return -ENODEV;
+
+	node = of_find_node_by_path("/chosen");
+	if (!node)
+		return -ENODEV;
+
+	/* remove any stale propertys so ours can be found */
+	of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL));
+	of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL));
+
+	htab_base = cpu_to_be64(__pa(htab_address));
+	of_add_property(node, &htab_base_prop);
+	htab_size = cpu_to_be64(htab_size_bytes);
+	of_add_property(node, &htab_size_prop);
+
+	of_node_put(node);
+	return 0;
+}
+late_initcall(export_htab_values);
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
new file mode 100644
index 000000000..c77e95e9b
--- /dev/null
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -0,0 +1,316 @@
+/*
+ * ppc64 code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation (version 2 of the License).
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <asm/ima.h>
+
+#define SLAVE_CODE_SIZE		256
+
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+	&kexec_elf64_ops,
+	NULL
+};
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+				  unsigned long buf_len)
+{
+	/* We don't support crash kernels yet. */
+	if (image->type == KEXEC_TYPE_CRASH)
+		return -EOPNOTSUPP;
+
+	return kexec_image_probe_default(image, buf, buf_len);
+}
+
+/**
+ * arch_kexec_walk_mem - call func(data) for each unreserved memory block
+ * @kbuf:	Context info for the search. Also passed to @func.
+ * @func:	Function to call for each memory block.
+ *
+ * This function is used by kexec_add_buffer and kexec_locate_mem_hole
+ * to find unreserved memory to load kexec segments into.
+ *
+ * Return: The memory walk will stop when func returns a non-zero value
+ * and that value will be returned. If all free regions are visited without
+ * func returning non-zero, then zero will be returned.
+ */
+int arch_kexec_walk_mem(struct kexec_buf *kbuf,
+			int (*func)(struct resource *, void *))
+{
+	int ret = 0;
+	u64 i;
+	phys_addr_t mstart, mend;
+	struct resource res = { };
+
+	if (kbuf->top_down) {
+		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
+						&mstart, &mend, NULL) {
+			/*
+			 * In memblock, end points to the first byte after the
+			 * range while in kexec, end points to the last byte
+			 * in the range.
+			 */
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
+			if (ret)
+				break;
+		}
+	} else {
+		for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend,
+					NULL) {
+			/*
+			 * In memblock, end points to the first byte after the
+			 * range while in kexec, end points to the last byte
+			 * in the range.
+			 */
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * setup_purgatory - initialize the purgatory's global variables
+ * @image:		kexec image.
+ * @slave_code:		Slave code for the purgatory.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @kernel_load_addr:	Address where the kernel is loaded.
+ * @fdt_load_addr:	Address where the flattened device tree is loaded.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_purgatory(struct kimage *image, const void *slave_code,
+		    const void *fdt, unsigned long kernel_load_addr,
+		    unsigned long fdt_load_addr)
+{
+	unsigned int *slave_code_buf, master_entry;
+	int ret;
+
+	slave_code_buf = kmalloc(SLAVE_CODE_SIZE, GFP_KERNEL);
+	if (!slave_code_buf)
+		return -ENOMEM;
+
+	/* Get the slave code from the new kernel and put it in purgatory. */
+	ret = kexec_purgatory_get_set_symbol(image, "purgatory_start",
+					     slave_code_buf, SLAVE_CODE_SIZE,
+					     true);
+	if (ret) {
+		kfree(slave_code_buf);
+		return ret;
+	}
+
+	master_entry = slave_code_buf[0];
+	memcpy(slave_code_buf, slave_code, SLAVE_CODE_SIZE);
+	slave_code_buf[0] = master_entry;
+	ret = kexec_purgatory_get_set_symbol(image, "purgatory_start",
+					     slave_code_buf, SLAVE_CODE_SIZE,
+					     false);
+	kfree(slave_code_buf);
+
+	ret = kexec_purgatory_get_set_symbol(image, "kernel", &kernel_load_addr,
+					     sizeof(kernel_load_addr), false);
+	if (ret)
+		return ret;
+	ret = kexec_purgatory_get_set_symbol(image, "dt_offset", &fdt_load_addr,
+					     sizeof(fdt_load_addr), false);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/**
+ * delete_fdt_mem_rsv - delete memory reservation with given address and size
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+{
+	int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
+
+	for (i = 0; i < num_rsvs; i++) {
+		uint64_t rsv_start, rsv_size;
+
+		ret = fdt_get_mem_rsv(fdt, i, &rsv_start, &rsv_size);
+		if (ret) {
+			pr_err("Malformed device tree.\n");
+			return -EINVAL;
+		}
+
+		if (rsv_start == start && rsv_size == size) {
+			ret = fdt_del_mem_rsv(fdt, i);
+			if (ret) {
+				pr_err("Error deleting device tree reservation.\n");
+				return -EINVAL;
+			}
+
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @image:		kexec image being loaded.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @initrd_load_addr:	Address where the next initrd will be loaded.
+ * @initrd_len:		Size of the next initrd, or 0 if there will be none.
+ * @cmdline:		Command line for the next kernel, or NULL if there will
+ *			be none.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_new_fdt(const struct kimage *image, void *fdt,
+		  unsigned long initrd_load_addr, unsigned long initrd_len,
+		  const char *cmdline)
+{
+	int ret, chosen_node;
+	const void *prop;
+
+	/* Remove memory reservation for the current device tree. */
+	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
+				 fdt_totalsize(initial_boot_params));
+	if (ret == 0)
+		pr_debug("Removed old device tree reservation.\n");
+	else if (ret != -ENOENT)
+		return ret;
+
+	chosen_node = fdt_path_offset(fdt, "/chosen");
+	if (chosen_node == -FDT_ERR_NOTFOUND) {
+		chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
+					      "chosen");
+		if (chosen_node < 0) {
+			pr_err("Error creating /chosen.\n");
+			return -EINVAL;
+		}
+	} else if (chosen_node < 0) {
+		pr_err("Malformed device tree: error reading /chosen.\n");
+		return -EINVAL;
+	}
+
+	/* Did we boot using an initrd? */
+	prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
+	if (prop) {
+		uint64_t tmp_start, tmp_end, tmp_size;
+
+		tmp_start = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+		prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", NULL);
+		if (!prop) {
+			pr_err("Malformed device tree.\n");
+			return -EINVAL;
+		}
+		tmp_end = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+		/*
+		 * kexec reserves exact initrd size, while firmware may
+		 * reserve a multiple of PAGE_SIZE, so check for both.
+		 */
+		tmp_size = tmp_end - tmp_start;
+		ret = delete_fdt_mem_rsv(fdt, tmp_start, tmp_size);
+		if (ret == -ENOENT)
+			ret = delete_fdt_mem_rsv(fdt, tmp_start,
+						 round_up(tmp_size, PAGE_SIZE));
+		if (ret == 0)
+			pr_debug("Removed old initrd reservation.\n");
+		else if (ret != -ENOENT)
+			return ret;
+
+		/* If there's no new initrd, delete the old initrd's info. */
+		if (initrd_len == 0) {
+			ret = fdt_delprop(fdt, chosen_node,
+					  "linux,initrd-start");
+			if (ret) {
+				pr_err("Error deleting linux,initrd-start.\n");
+				return -EINVAL;
+			}
+
+			ret = fdt_delprop(fdt, chosen_node, "linux,initrd-end");
+			if (ret) {
+				pr_err("Error deleting linux,initrd-end.\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	if (initrd_len) {
+		ret = fdt_setprop_u64(fdt, chosen_node,
+				      "linux,initrd-start",
+				      initrd_load_addr);
+		if (ret < 0)
+			goto err;
+
+		/* initrd-end is the first address after the initrd image. */
+		ret = fdt_setprop_u64(fdt, chosen_node, "linux,initrd-end",
+				      initrd_load_addr + initrd_len);
+		if (ret < 0)
+			goto err;
+
+		ret = fdt_add_mem_rsv(fdt, initrd_load_addr, initrd_len);
+		if (ret) {
+			pr_err("Error reserving initrd memory: %s\n",
+			       fdt_strerror(ret));
+			return -EINVAL;
+		}
+	}
+
+	if (cmdline != NULL) {
+		ret = fdt_setprop_string(fdt, chosen_node, "bootargs", cmdline);
+		if (ret < 0)
+			goto err;
+	} else {
+		ret = fdt_delprop(fdt, chosen_node, "bootargs");
+		if (ret && ret != -FDT_ERR_NOTFOUND) {
+			pr_err("Error deleting bootargs.\n");
+			return -EINVAL;
+		}
+	}
+
+	ret = setup_ima_buffer(image, fdt, chosen_node);
+	if (ret) {
+		pr_err("Error setting up the new device tree.\n");
+		return ret;
+	}
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	pr_err("Error setting up the new device tree.\n");
+	return -EINVAL;
+}
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
new file mode 100644
index 000000000..93e06778b
--- /dev/null
+++ b/arch/powerpc/kernel/mce.c
@@ -0,0 +1,633 @@
+/*
+ * Machine check exception handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/irq_work.h>
+
+#include <asm/machdep.h>
+#include <asm/mce.h>
+
+static DEFINE_PER_CPU(int, mce_nest_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
+
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
+/* Queue for delayed MCE UE events. */
+static DEFINE_PER_CPU(int, mce_ue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
+					mce_ue_event_queue);
+
+static void machine_check_process_queued_event(struct irq_work *work);
+static void machine_check_ue_irq_work(struct irq_work *work);
+void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_process_ue_event(struct work_struct *work);
+
+static struct irq_work mce_event_process_work = {
+        .func = machine_check_process_queued_event,
+};
+
+static struct irq_work mce_ue_event_irq_work = {
+	.func = machine_check_ue_irq_work,
+};
+
+DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
+
+static void mce_set_error_info(struct machine_check_event *mce,
+			       struct mce_error_info *mce_err)
+{
+	mce->error_type = mce_err->error_type;
+	switch (mce_err->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
+		break;
+	case MCE_ERROR_TYPE_USER:
+		mce->u.user_error.user_error_type = mce_err->u.user_error_type;
+		break;
+	case MCE_ERROR_TYPE_RA:
+		mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
+		break;
+	case MCE_ERROR_TYPE_LINK:
+		mce->u.link_error.link_error_type = mce_err->u.link_error_type;
+		break;
+	case MCE_ERROR_TYPE_UNKNOWN:
+	default:
+		break;
+	}
+}
+
+/*
+ * Decode and save high level MCE information into per cpu buffer which
+ * is an array of machine_check_event structure.
+ */
+void save_mce_event(struct pt_regs *regs, long handled,
+		    struct mce_error_info *mce_err,
+		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
+{
+	int index = __this_cpu_inc_return(mce_nest_count) - 1;
+	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+
+	/*
+	 * Return if we don't have enough space to log mce event.
+	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
+	 * the check below will stop buffer overrun.
+	 */
+	if (index >= MAX_MC_EVT)
+		return;
+
+	/* Populate generic machine check info */
+	mce->version = MCE_V1;
+	mce->srr0 = nip;
+	mce->srr1 = regs->msr;
+	mce->gpr3 = regs->gpr[3];
+	mce->in_use = 1;
+
+	/* Mark it recovered if we have handled it and MSR(RI=1). */
+	if (handled && (regs->msr & MSR_RI))
+		mce->disposition = MCE_DISPOSITION_RECOVERED;
+	else
+		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
+
+	mce->initiator = mce_err->initiator;
+	mce->severity = mce_err->severity;
+
+	/*
+	 * Populate the mce error_type and type-specific error_type.
+	 */
+	mce_set_error_info(mce, mce_err);
+
+	if (!addr)
+		return;
+
+	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
+		mce->u.tlb_error.effective_address_provided = true;
+		mce->u.tlb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
+		mce->u.slb_error.effective_address_provided = true;
+		mce->u.slb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
+		mce->u.erat_error.effective_address_provided = true;
+		mce->u.erat_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_USER) {
+		mce->u.user_error.effective_address_provided = true;
+		mce->u.user_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_RA) {
+		mce->u.ra_error.effective_address_provided = true;
+		mce->u.ra_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
+		mce->u.link_error.effective_address_provided = true;
+		mce->u.link_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+		mce->u.ue_error.effective_address_provided = true;
+		mce->u.ue_error.effective_address = addr;
+		if (phys_addr != ULONG_MAX) {
+			mce->u.ue_error.physical_address_provided = true;
+			mce->u.ue_error.physical_address = phys_addr;
+			machine_check_ue_event(mce);
+		}
+	}
+	return;
+}
+
+/*
+ * get_mce_event:
+ *	mce	Pointer to machine_check_event structure to be filled.
+ *	release Flag to indicate whether to free the event slot or not.
+ *		0 <= do not release the mce event. Caller will invoke
+ *		     release_mce_event() once event has been consumed.
+ *		1 <= release the slot.
+ *
+ *	return	1 = success
+ *		0 = failure
+ *
+ * get_mce_event() will be called by platform specific machine check
+ * handle routine and in KVM.
+ * When we call get_mce_event(), we are still in interrupt context and
+ * preemption will not be scheduled until ret_from_expect() routine
+ * is called.
+ */
+int get_mce_event(struct machine_check_event *mce, bool release)
+{
+	int index = __this_cpu_read(mce_nest_count) - 1;
+	struct machine_check_event *mc_evt;
+	int ret = 0;
+
+	/* Sanity check */
+	if (index < 0)
+		return ret;
+
+	/* Check if we have MCE info to process. */
+	if (index < MAX_MC_EVT) {
+		mc_evt = this_cpu_ptr(&mce_event[index]);
+		/* Copy the event structure and release the original */
+		if (mce)
+			*mce = *mc_evt;
+		if (release)
+			mc_evt->in_use = 0;
+		ret = 1;
+	}
+	/* Decrement the count to free the slot. */
+	if (release)
+		__this_cpu_dec(mce_nest_count);
+
+	return ret;
+}
+
+void release_mce_event(void)
+{
+	get_mce_event(NULL, true);
+}
+
+static void machine_check_ue_irq_work(struct irq_work *work)
+{
+	schedule_work(&mce_ue_event_work);
+}
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_ue_event(struct machine_check_event *evt)
+{
+	int index;
+
+	index = __this_cpu_inc_return(mce_ue_count) - 1;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__this_cpu_dec(mce_ue_count);
+		return;
+	}
+	memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+
+	/* Queue work to process this event later. */
+	irq_work_queue(&mce_ue_event_irq_work);
+}
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+	int index;
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return;
+
+	index = __this_cpu_inc_return(mce_queue_count) - 1;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__this_cpu_dec(mce_queue_count);
+		return;
+	}
+	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
+
+	/* Queue irq work to process this event later. */
+	irq_work_queue(&mce_event_process_work);
+}
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_process_ue_event(struct work_struct *work)
+{
+	int index;
+	struct machine_check_event *evt;
+
+	while (__this_cpu_read(mce_ue_count) > 0) {
+		index = __this_cpu_read(mce_ue_count) - 1;
+		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+#ifdef CONFIG_MEMORY_FAILURE
+		/*
+		 * This should probably queued elsewhere, but
+		 * oh! well
+		 */
+		if (evt->error_type == MCE_ERROR_TYPE_UE) {
+			if (evt->u.ue_error.physical_address_provided) {
+				unsigned long pfn;
+
+				pfn = evt->u.ue_error.physical_address >>
+					PAGE_SHIFT;
+				memory_failure(pfn, 0);
+			} else
+				pr_warn("Failed to identify bad address from "
+					"where the uncorrectable error (UE) "
+					"was generated\n");
+		}
+#endif
+		__this_cpu_dec(mce_ue_count);
+	}
+}
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_check_process_queued_event(struct irq_work *work)
+{
+	int index;
+	struct machine_check_event *evt;
+
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	/*
+	 * For now just print it to console.
+	 * TODO: log this error event to FSP or nvram.
+	 */
+	while (__this_cpu_read(mce_queue_count) > 0) {
+		index = __this_cpu_read(mce_queue_count) - 1;
+		evt = this_cpu_ptr(&mce_event_queue[index]);
+		machine_check_print_event_info(evt, false);
+		__this_cpu_dec(mce_queue_count);
+	}
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt,
+				    bool user_mode)
+{
+	const char *level, *sevstr, *subtype;
+	static const char *mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+	static const char *mc_slb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_erat_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_tlb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_user_types[] = {
+		"Indeterminate",
+		"tlbie(l) invalid",
+	};
+	static const char *mc_ra_types[] = {
+		"Indeterminate",
+		"Instruction fetch (bad)",
+		"Instruction fetch (foreign)",
+		"Page table walk ifetch (bad)",
+		"Page table walk ifetch (foreign)",
+		"Load (bad)",
+		"Store (bad)",
+		"Page table walk Load/Store (bad)",
+		"Page table walk Load/Store (foreign)",
+		"Load/Store (foreign)",
+	};
+	static const char *mc_link_types[] = {
+		"Indeterminate",
+		"Instruction fetch (timeout)",
+		"Page table walk ifetch (timeout)",
+		"Load (timeout)",
+		"Store (timeout)",
+		"Page table walk Load/Store (timeout)",
+	};
+
+	/* Print things out */
+	if (evt->version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt->version);
+		return;
+	}
+	switch (evt->severity) {
+	case MCE_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case MCE_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case MCE_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case MCE_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
+	       "Recovered" : "Not recovered");
+
+	if (user_mode) {
+		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
+			evt->srr0, current->pid, current->comm);
+	} else {
+		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
+		       (void *)evt->srr0);
+	}
+
+	printk("%s  Initiator: %s\n", level,
+	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		subtype = evt->u.ue_error.ue_error_type <
+			ARRAY_SIZE(mc_ue_types) ?
+			mc_ue_types[evt->u.ue_error.ue_error_type]
+			: "Unknown";
+		printk("%s  Error type: UE [%s]\n", level, subtype);
+		if (evt->u.ue_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.ue_error.effective_address);
+		if (evt->u.ue_error.physical_address_provided)
+			printk("%s    Physical address:  %016llx\n",
+			       level, evt->u.ue_error.physical_address);
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		subtype = evt->u.slb_error.slb_error_type <
+			ARRAY_SIZE(mc_slb_types) ?
+			mc_slb_types[evt->u.slb_error.slb_error_type]
+			: "Unknown";
+		printk("%s  Error type: SLB [%s]\n", level, subtype);
+		if (evt->u.slb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.slb_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		subtype = evt->u.erat_error.erat_error_type <
+			ARRAY_SIZE(mc_erat_types) ?
+			mc_erat_types[evt->u.erat_error.erat_error_type]
+			: "Unknown";
+		printk("%s  Error type: ERAT [%s]\n", level, subtype);
+		if (evt->u.erat_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.erat_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		subtype = evt->u.tlb_error.tlb_error_type <
+			ARRAY_SIZE(mc_tlb_types) ?
+			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
+			: "Unknown";
+		printk("%s  Error type: TLB [%s]\n", level, subtype);
+		if (evt->u.tlb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.tlb_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_USER:
+		subtype = evt->u.user_error.user_error_type <
+			ARRAY_SIZE(mc_user_types) ?
+			mc_user_types[evt->u.user_error.user_error_type]
+			: "Unknown";
+		printk("%s  Error type: User [%s]\n", level, subtype);
+		if (evt->u.user_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.user_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_RA:
+		subtype = evt->u.ra_error.ra_error_type <
+			ARRAY_SIZE(mc_ra_types) ?
+			mc_ra_types[evt->u.ra_error.ra_error_type]
+			: "Unknown";
+		printk("%s  Error type: Real address [%s]\n", level, subtype);
+		if (evt->u.ra_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.ra_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_LINK:
+		subtype = evt->u.link_error.link_error_type <
+			ARRAY_SIZE(mc_link_types) ?
+			mc_link_types[evt->u.link_error.link_error_type]
+			: "Unknown";
+		printk("%s  Error type: Link [%s]\n", level, subtype);
+		if (evt->u.link_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.link_error.effective_address);
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		printk("%s  Error type: Unknown\n", level);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(machine_check_print_event_info);
+
+/*
+ * This function is called in real mode. Strictly no printk's please.
+ *
+ * regs->nip and regs->msr contains srr0 and ssr1.
+ */
+long machine_check_early(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	__this_cpu_inc(irq_stat.mce_exceptions);
+
+	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+		handled = cur_cpu_spec->machine_check_early(regs);
+	return handled;
+}
+
+/* Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9 */
+static enum {
+	DTRIG_UNKNOWN,
+	DTRIG_VECTOR_CI,	/* need to emulate vector CI load instr */
+	DTRIG_SUSPEND_ESCAPE,	/* need to escape from TM suspend mode */
+} hmer_debug_trig_function;
+
+static int init_debug_trig_function(void)
+{
+	int pvr;
+	struct device_node *cpun;
+	struct property *prop = NULL;
+	const char *str;
+
+	/* First look in the device tree */
+	preempt_disable();
+	cpun = of_get_cpu_node(smp_processor_id(), NULL);
+	if (cpun) {
+		of_property_for_each_string(cpun, "ibm,hmi-special-triggers",
+					    prop, str) {
+			if (strcmp(str, "bit17-vector-ci-load") == 0)
+				hmer_debug_trig_function = DTRIG_VECTOR_CI;
+			else if (strcmp(str, "bit17-tm-suspend-escape") == 0)
+				hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
+		}
+		of_node_put(cpun);
+	}
+	preempt_enable();
+
+	/* If we found the property, don't look at PVR */
+	if (prop)
+		goto out;
+
+	pvr = mfspr(SPRN_PVR);
+	/* Check for POWER9 Nimbus (scale-out) */
+	if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) {
+		/* DD2.2 and later */
+		if ((pvr & 0xfff) >= 0x202)
+			hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
+		/* DD2.0 and DD2.1 - used for vector CI load emulation */
+		else if ((pvr & 0xfff) >= 0x200)
+			hmer_debug_trig_function = DTRIG_VECTOR_CI;
+	}
+
+ out:
+	switch (hmer_debug_trig_function) {
+	case DTRIG_VECTOR_CI:
+		pr_debug("HMI debug trigger used for vector CI load\n");
+		break;
+	case DTRIG_SUSPEND_ESCAPE:
+		pr_debug("HMI debug trigger used for TM suspend escape\n");
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+__initcall(init_debug_trig_function);
+
+/*
+ * Handle HMIs that occur as a result of a debug trigger.
+ * Return values:
+ * -1 means this is not a HMI cause that we know about
+ *  0 means no further handling is required
+ *  1 means further handling is required
+ */
+long hmi_handle_debugtrig(struct pt_regs *regs)
+{
+	unsigned long hmer = mfspr(SPRN_HMER);
+	long ret = 0;
+
+	/* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */
+	if (!((hmer & HMER_DEBUG_TRIG)
+	      && hmer_debug_trig_function != DTRIG_UNKNOWN))
+		return -1;
+		
+	hmer &= ~HMER_DEBUG_TRIG;
+	/* HMER is a write-AND register */
+	mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG);
+
+	switch (hmer_debug_trig_function) {
+	case DTRIG_VECTOR_CI:
+		/*
+		 * Now to avoid problems with soft-disable we
+		 * only do the emulation if we are coming from
+		 * host user space
+		 */
+		if (regs && user_mode(regs))
+			ret = local_paca->hmi_p9_special_emu = 1;
+
+		break;
+
+	default:
+		break;
+	}
+
+	/*
+	 * See if any other HMI causes remain to be handled
+	 */
+	if (hmer & mfspr(SPRN_HMEER))
+		return -1;
+
+	return ret;
+}
+
+/*
+ * Return values:
+ */
+long hmi_exception_realmode(struct pt_regs *regs)
+{	
+	int ret;
+
+	__this_cpu_inc(irq_stat.hmi_exceptions);
+
+	ret = hmi_handle_debugtrig(regs);
+	if (ret >= 0)
+		return ret;
+
+	wait_for_subcore_guest_exit();
+
+	if (ppc_md.hmi_exception_early)
+		ppc_md.hmi_exception_early(regs);
+
+	wait_for_tb_resync();
+
+	return 1;
+}
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
new file mode 100644
index 000000000..ecb375040
--- /dev/null
+++ b/arch/powerpc/kernel/mce_power.c
@@ -0,0 +1,630 @@
+/*
+ * Machine check exception handling CPU-side for power7 and power8
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/pte-walk.h>
+#include <asm/sstep.h>
+#include <asm/exception-64s.h>
+
+/*
+ * Convert an address related to an mm to a PFN. NOTE: we are in real
+ * mode, we could potentially race with page table updates.
+ */
+static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+{
+	pte_t *ptep;
+	unsigned int shift;
+	unsigned long pfn, flags;
+	struct mm_struct *mm;
+
+	if (user_mode(regs))
+		mm = current->mm;
+	else
+		mm = &init_mm;
+
+	local_irq_save(flags);
+	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
+
+	if (!ptep || pte_special(*ptep)) {
+		pfn = ULONG_MAX;
+		goto out;
+	}
+
+	if (shift <= PAGE_SHIFT)
+		pfn = pte_pfn(*ptep);
+	else {
+		unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+		pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+	}
+
+out:
+	local_irq_restore(flags);
+	return pfn;
+}
+
+/* flush SLBs and reload */
+#ifdef CONFIG_PPC_BOOK3S_64
+static void flush_and_reload_slb(void)
+{
+	/* Invalidate all SLBs */
+	slb_flush_all_realmode();
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+	/*
+	 * If machine check is hit when in guest or in transition, we will
+	 * only flush the SLBs and continue.
+	 */
+	if (get_paca()->kvm_hstate.in_guest)
+		return;
+#endif
+	if (early_radix_enabled())
+		return;
+
+	/*
+	 * This probably shouldn't happen, but it may be possible it's
+	 * called in early boot before SLB shadows are allocated.
+	 */
+	if (!get_slb_shadow())
+		return;
+
+	slb_restore_bolted_realmode();
+}
+#endif
+
+static void flush_erat(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+		flush_and_reload_slb();
+		return;
+	}
+#endif
+	/* PPC_INVALIDATE_ERAT can only be used on ISA v3 and newer */
+	asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
+}
+
+#define MCE_FLUSH_SLB 1
+#define MCE_FLUSH_TLB 2
+#define MCE_FLUSH_ERAT 3
+
+static int mce_flush(int what)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (what == MCE_FLUSH_SLB) {
+		flush_and_reload_slb();
+		return 1;
+	}
+#endif
+	if (what == MCE_FLUSH_ERAT) {
+		flush_erat();
+		return 1;
+	}
+	if (what == MCE_FLUSH_TLB) {
+		tlbiel_all();
+		return 1;
+	}
+
+	return 0;
+}
+
+#define SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42))
+
+struct mce_ierror_table {
+	unsigned long srr1_mask;
+	unsigned long srr1_value;
+	bool nip_valid; /* nip is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_ierror_table mce_p7_ierror_table[] = {
+{ 0x00000000001c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p8_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH_FOREIGN,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000080c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008100000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008140000, false,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x0000000008180000, false,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x00000000081c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct mce_derror_table {
+	unsigned long dsisr_value;
+	bool dar_valid; /* dar is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_derror_table mce_p7_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p8_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, false,
+  MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000020, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000010, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000008, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
+					uint64_t *phys_addr)
+{
+	/*
+	 * Carefully look at the NIP to determine
+	 * the instruction to analyse. Reading the NIP
+	 * in real-mode is tricky and can lead to recursive
+	 * faults
+	 */
+	int instr;
+	unsigned long pfn, instr_addr;
+	struct instruction_op op;
+	struct pt_regs tmp = *regs;
+
+	pfn = addr_to_pfn(regs, regs->nip);
+	if (pfn != ULONG_MAX) {
+		instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+		instr = *(unsigned int *)(instr_addr);
+		if (!analyse_instr(&op, &tmp, instr)) {
+			pfn = addr_to_pfn(regs, op.ea);
+			*addr = op.ea;
+			*phys_addr = (pfn << PAGE_SHIFT);
+			return 0;
+		}
+		/*
+		 * analyse_instr() might fail if the instruction
+		 * is not a load/store, although this is unexpected
+		 * for load/store errors or if we got the NIP
+		 * wrong
+		 */
+	}
+	*addr = 0;
+	return -1;
+}
+
+static int mce_handle_ierror(struct pt_regs *regs,
+		const struct mce_ierror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr,
+		uint64_t *phys_addr)
+{
+	uint64_t srr1 = regs->msr;
+	int handled = 0;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].srr1_mask; i++) {
+		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+			continue;
+
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			handled = mce_flush(MCE_FLUSH_SLB);
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			handled = mce_flush(MCE_FLUSH_ERAT);
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			handled = mce_flush(MCE_FLUSH_TLB);
+			break;
+		}
+
+		/* now fill in mce_error_info */
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].nip_valid) {
+			*addr = regs->nip;
+			if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+				table[i].error_type == MCE_ERROR_TYPE_UE) {
+				unsigned long pfn;
+
+				if (get_paca()->in_mce < MAX_MCE_DEPTH) {
+					pfn = addr_to_pfn(regs, regs->nip);
+					if (pfn != ULONG_MAX) {
+						*phys_addr =
+							(pfn << PAGE_SHIFT);
+					}
+				}
+			}
+		}
+		return handled;
+	}
+
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
+
+	return 0;
+}
+
+static int mce_handle_derror(struct pt_regs *regs,
+		const struct mce_derror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr,
+		uint64_t *phys_addr)
+{
+	uint64_t dsisr = regs->dsisr;
+	int handled = 0;
+	int found = 0;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].dsisr_value; i++) {
+		if (!(dsisr & table[i].dsisr_value))
+			continue;
+
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			if (mce_flush(MCE_FLUSH_SLB))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			if (mce_flush(MCE_FLUSH_ERAT))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			if (mce_flush(MCE_FLUSH_TLB))
+				handled = 1;
+			break;
+		}
+
+		/*
+		 * Attempt to handle multiple conditions, but only return
+		 * one. Ensure uncorrectable errors are first in the table
+		 * to match.
+		 */
+		if (found)
+			continue;
+
+		/* now fill in mce_error_info */
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].dar_valid)
+			*addr = regs->dar;
+		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+				table[i].error_type == MCE_ERROR_TYPE_UE) {
+			/*
+			 * We do a maximum of 4 nested MCE calls, see
+			 * kernel/exception-64s.h
+			 */
+			if (get_paca()->in_mce < MAX_MCE_DEPTH)
+				mce_find_instr_ea_and_phys(regs, addr,
+							   phys_addr);
+		}
+		found = 1;
+	}
+
+	if (found)
+		return handled;
+
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
+
+	return 0;
+}
+
+static long mce_handle_ue_error(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	/*
+	 * On specific SCOM read via MMIO we may get a machine check
+	 * exception with SRR0 pointing inside opal. If that is the
+	 * case OPAL may have recovery address to re-read SCOM data in
+	 * different way and hence we can recover from this MC.
+	 */
+
+	if (ppc_md.mce_check_early_recovery) {
+		if (ppc_md.mce_check_early_recovery(regs))
+			handled = 1;
+	}
+	return handled;
+}
+
+static long mce_handle_error(struct pt_regs *regs,
+		const struct mce_derror_table dtable[],
+		const struct mce_ierror_table itable[])
+{
+	struct mce_error_info mce_err = { 0 };
+	uint64_t addr, phys_addr = ULONG_MAX;
+	uint64_t srr1 = regs->msr;
+	long handled;
+
+	if (SRR1_MC_LOADSTORE(srr1))
+		handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
+				&phys_addr);
+	else
+		handled = mce_handle_ierror(regs, itable, &mce_err, &addr,
+				&phys_addr);
+
+	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
+		handled = mce_handle_ue_error(regs);
+
+	save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
+
+	return handled;
+}
+
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+	/* P7 DD1 leaves top bits of DSISR undefined */
+	regs->dsisr &= 0x0000ffff;
+
+	return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
+}
+
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+	return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
+}
+
+long __machine_check_early_realmode_p9(struct pt_regs *regs)
+{
+	/*
+	 * On POWER9 DD2.1 and below, it's possible to get a machine check
+	 * caused by a paste instruction where only DSISR bit 25 is set. This
+	 * will result in the MCE handler seeing an unknown event and the kernel
+	 * crashing. An MCE that occurs like this is spurious, so we don't need
+	 * to do anything in terms of servicing it. If there is something that
+	 * needs to be serviced, the CPU will raise the MCE again with the
+	 * correct DSISR so that it can be serviced properly. So detect this
+	 * case and mark it as handled.
+	 */
+	if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000)
+		return 1;
+
+	return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
+}
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
new file mode 100644
index 000000000..0b196cdcd
--- /dev/null
+++ b/arch/powerpc/kernel/misc.S
@@ -0,0 +1,120 @@
+/*
+ * This file contains miscellaneous low-level functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
+ *
+ * setjmp/longjmp code by Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/export.h>
+
+	.text
+
+/*
+ * Returns (address we are running at) - (address we were linked at)
+ * for use before the text and data are mapped to KERNELBASE.
+
+ * add_reloc_offset(x) returns x + reloc_offset().
+ */
+
+_GLOBAL(reloc_offset)
+	li	r3, 0
+_GLOBAL(add_reloc_offset)
+	mflr	r0
+	bl	1f
+1:	mflr	r5
+	PPC_LL	r4,(2f-1b)(r5)
+	subf	r5,r4,r5
+	add	r3,r3,r5
+	mtlr	r0
+	blr
+
+	.align	3
+2:	PPC_LONG 1b
+
+_GLOBAL(setjmp)
+	mflr	r0
+	PPC_STL	r0,0(r3)
+	PPC_STL	r1,SZL(r3)
+	PPC_STL	r2,2*SZL(r3)
+#ifdef CONFIG_PPC32
+	mfcr	r12
+	stmw	r12, 3*SZL(r3)
+#else
+	mfcr	r0
+	PPC_STL	r0,3*SZL(r3)
+	PPC_STL	r13,4*SZL(r3)
+	PPC_STL	r14,5*SZL(r3)
+	PPC_STL	r15,6*SZL(r3)
+	PPC_STL	r16,7*SZL(r3)
+	PPC_STL	r17,8*SZL(r3)
+	PPC_STL	r18,9*SZL(r3)
+	PPC_STL	r19,10*SZL(r3)
+	PPC_STL	r20,11*SZL(r3)
+	PPC_STL	r21,12*SZL(r3)
+	PPC_STL	r22,13*SZL(r3)
+	PPC_STL	r23,14*SZL(r3)
+	PPC_STL	r24,15*SZL(r3)
+	PPC_STL	r25,16*SZL(r3)
+	PPC_STL	r26,17*SZL(r3)
+	PPC_STL	r27,18*SZL(r3)
+	PPC_STL	r28,19*SZL(r3)
+	PPC_STL	r29,20*SZL(r3)
+	PPC_STL	r30,21*SZL(r3)
+	PPC_STL	r31,22*SZL(r3)
+#endif
+	li	r3,0
+	blr
+
+_GLOBAL(longjmp)
+#ifdef CONFIG_PPC32
+	lmw	r12, 3*SZL(r3)
+	mtcrf	0x38, r12
+#else
+	PPC_LL	r13,4*SZL(r3)
+	PPC_LL	r14,5*SZL(r3)
+	PPC_LL	r15,6*SZL(r3)
+	PPC_LL	r16,7*SZL(r3)
+	PPC_LL	r17,8*SZL(r3)
+	PPC_LL	r18,9*SZL(r3)
+	PPC_LL	r19,10*SZL(r3)
+	PPC_LL	r20,11*SZL(r3)
+	PPC_LL	r21,12*SZL(r3)
+	PPC_LL	r22,13*SZL(r3)
+	PPC_LL	r23,14*SZL(r3)
+	PPC_LL	r24,15*SZL(r3)
+	PPC_LL	r25,16*SZL(r3)
+	PPC_LL	r26,17*SZL(r3)
+	PPC_LL	r27,18*SZL(r3)
+	PPC_LL	r28,19*SZL(r3)
+	PPC_LL	r29,20*SZL(r3)
+	PPC_LL	r30,21*SZL(r3)
+	PPC_LL	r31,22*SZL(r3)
+	PPC_LL	r0,3*SZL(r3)
+	mtcrf	0x38,r0
+#endif
+	PPC_LL	r0,0(r3)
+	PPC_LL	r1,SZL(r3)
+	PPC_LL	r2,2*SZL(r3)
+	mtlr	r0
+	mr.	r3, r4
+	bnelr
+	li	r3, 1
+	blr
+
+_GLOBAL(current_stack_pointer)
+	PPC_LL	r3,0(r1)
+	blr
+EXPORT_SYMBOL(current_stack_pointer)
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
new file mode 100644
index 000000000..695b24a2d
--- /dev/null
+++ b/arch/powerpc/kernel/misc_32.S
@@ -0,0 +1,1099 @@
+/*
+ * This file contains miscellaneous low-level functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * kexec bits:
+ * Copyright (C) 2002-2003 Eric Biederman  <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * PPC44x port. Copyright (C) 2011,  IBM Corporation
+ * 		Author: Suzuki Poulose <suzuki@in.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sys.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/cputable.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+#include <asm/kexec.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+
+	.text
+
+/*
+ * We store the saved ksp_limit in the unused part
+ * of the STACK_FRAME_OVERHEAD
+ */
+_GLOBAL(call_do_softirq)
+	mflr	r0
+	stw	r0,4(r1)
+	lwz	r10,THREAD+KSP_LIMIT(r2)
+	addi	r11,r3,THREAD_INFO_GAP
+	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
+	mr	r1,r3
+	stw	r10,8(r1)
+	stw	r11,THREAD+KSP_LIMIT(r2)
+	bl	__do_softirq
+	lwz	r10,8(r1)
+	lwz	r1,0(r1)
+	lwz	r0,4(r1)
+	stw	r10,THREAD+KSP_LIMIT(r2)
+	mtlr	r0
+	blr
+
+/*
+ * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ */
+_GLOBAL(call_do_irq)
+	mflr	r0
+	stw	r0,4(r1)
+	lwz	r10,THREAD+KSP_LIMIT(r2)
+	addi	r11,r4,THREAD_INFO_GAP
+	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+	mr	r1,r4
+	stw	r10,8(r1)
+	stw	r11,THREAD+KSP_LIMIT(r2)
+	bl	__do_irq
+	lwz	r10,8(r1)
+	lwz	r1,0(r1)
+	lwz	r0,4(r1)
+	stw	r10,THREAD+KSP_LIMIT(r2)
+	mtlr	r0
+	blr
+
+/*
+ * This returns the high 64 bits of the product of two 64-bit numbers.
+ */
+_GLOBAL(mulhdu)
+	cmpwi	r6,0
+	cmpwi	cr1,r3,0
+	mr	r10,r4
+	mulhwu	r4,r4,r5
+	beq	1f
+	mulhwu	r0,r10,r6
+	mullw	r7,r10,r5
+	addc	r7,r0,r7
+	addze	r4,r4
+1:	beqlr	cr1		/* all done if high part of A is 0 */
+	mullw	r9,r3,r5
+	mulhwu	r10,r3,r5
+	beq	2f
+	mullw	r0,r3,r6
+	mulhwu	r8,r3,r6
+	addc	r7,r0,r7
+	adde	r4,r4,r8
+	addze	r10,r10
+2:	addc	r4,r4,r9
+	addze	r3,r10
+	blr
+
+/*
+ * reloc_got2 runs through the .got2 section adding an offset
+ * to each entry.
+ */
+_GLOBAL(reloc_got2)
+	mflr	r11
+	lis	r7,__got2_start@ha
+	addi	r7,r7,__got2_start@l
+	lis	r8,__got2_end@ha
+	addi	r8,r8,__got2_end@l
+	subf	r8,r7,r8
+	srwi.	r8,r8,2
+	beqlr
+	mtctr	r8
+	bl	1f
+1:	mflr	r0
+	lis	r4,1b@ha
+	addi	r4,r4,1b@l
+	subf	r0,r4,r0
+	add	r7,r0,r7
+2:	lwz	r0,0(r7)
+	add	r0,r0,r3
+	stw	r0,0(r7)
+	addi	r7,r7,4
+	bdnz	2b
+	mtlr	r11
+	blr
+
+/*
+ * call_setup_cpu - call the setup_cpu function for this cpu
+ * r3 = data offset, r24 = cpu number
+ *
+ * Setup function is called with:
+ *   r3 = data offset
+ *   r4 = ptr to CPU spec (relocated)
+ */
+_GLOBAL(call_setup_cpu)
+	addis	r4,r3,cur_cpu_spec@ha
+	addi	r4,r4,cur_cpu_spec@l
+	lwz	r4,0(r4)
+	add	r4,r4,r3
+	lwz	r5,CPU_SPEC_SETUP(r4)
+	cmpwi	0,r5,0
+	add	r5,r5,r3
+	beqlr
+	mtctr	r5
+	bctr
+
+#if defined(CONFIG_CPU_FREQ_PMAC) && defined(CONFIG_6xx)
+
+/* This gets called by via-pmu.c to switch the PLL selection
+ * on 750fx CPU. This function should really be moved to some
+ * other place (as most of the cpufreq code in via-pmu
+ */
+_GLOBAL(low_choose_750fx_pll)
+	/* Clear MSR:EE */
+	mfmsr	r7
+	rlwinm	r0,r7,0,17,15
+	mtmsr	r0
+
+	/* If switching to PLL1, disable HID0:BTIC */
+	cmplwi	cr0,r3,0
+	beq	1f
+	mfspr	r5,SPRN_HID0
+	rlwinm	r5,r5,0,27,25
+	sync
+	mtspr	SPRN_HID0,r5
+	isync
+	sync
+
+1:
+	/* Calc new HID1 value */
+	mfspr	r4,SPRN_HID1	/* Build a HID1:PS bit from parameter */
+	rlwinm	r5,r3,16,15,15	/* Clear out HID1:PS from value read */
+	rlwinm	r4,r4,0,16,14	/* Could have I used rlwimi here ? */
+	or	r4,r4,r5
+	mtspr	SPRN_HID1,r4
+
+	/* Store new HID1 image */
+	CURRENT_THREAD_INFO(r6, r1)
+	lwz	r6,TI_CPU(r6)
+	slwi	r6,r6,2
+	addis	r6,r6,nap_save_hid1@ha
+	stw	r4,nap_save_hid1@l(r6)
+
+	/* If switching to PLL0, enable HID0:BTIC */
+	cmplwi	cr0,r3,0
+	bne	1f
+	mfspr	r5,SPRN_HID0
+	ori	r5,r5,HID0_BTIC
+	sync
+	mtspr	SPRN_HID0,r5
+	isync
+	sync
+
+1:
+	/* Return */
+	mtmsr	r7
+	blr
+
+_GLOBAL(low_choose_7447a_dfs)
+	/* Clear MSR:EE */
+	mfmsr	r7
+	rlwinm	r0,r7,0,17,15
+	mtmsr	r0
+	
+	/* Calc new HID1 value */
+	mfspr	r4,SPRN_HID1
+	insrwi	r4,r3,1,9	/* insert parameter into bit 9 */
+	sync
+	mtspr	SPRN_HID1,r4
+	sync
+	isync
+
+	/* Return */
+	mtmsr	r7
+	blr
+
+#endif /* CONFIG_CPU_FREQ_PMAC && CONFIG_6xx */
+
+/*
+ * complement mask on the msr then "or" some values on.
+ *     _nmask_and_or_msr(nmask, value_to_or)
+ */
+_GLOBAL(_nmask_and_or_msr)
+	mfmsr	r0		/* Get current msr */
+	andc	r0,r0,r3	/* And off the bits set in r3 (first parm) */
+	or	r0,r0,r4	/* Or on the bits in r4 (second parm) */
+	SYNC			/* Some chip revs have problems here... */
+	mtmsr	r0		/* Update machine state */
+	isync
+	blr			/* Done */
+
+#ifdef CONFIG_40x
+
+/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_readb)
+	mfmsr	r7
+	rlwinm	r0,r7,0,~MSR_DR
+	sync
+	mtmsr	r0
+	sync
+	isync
+	lbz	r3,0(r3)
+	sync
+	mtmsr	r7
+	sync
+	isync
+	blr
+
+	/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_writeb)
+	mfmsr	r7
+	rlwinm	r0,r7,0,~MSR_DR
+	sync
+	mtmsr	r0
+	sync
+	isync
+	stb	r3,0(r4)
+	sync
+	mtmsr	r7
+	sync
+	isync
+	blr
+
+#endif /* CONFIG_40x */
+
+
+/*
+ * Flush instruction cache.
+ * This is a no-op on the 601.
+ */
+#ifndef CONFIG_PPC_8xx
+_GLOBAL(flush_instruction_cache)
+#if defined(CONFIG_4xx)
+#ifdef CONFIG_403GCX
+	li      r3, 512
+	mtctr   r3
+	lis     r4, KERNELBASE@h
+1:	iccci   0, r4
+	addi    r4, r4, 16
+	bdnz    1b
+#else
+	lis	r3, KERNELBASE@h
+	iccci	0,r3
+#endif
+#elif defined(CONFIG_FSL_BOOKE)
+BEGIN_FTR_SECTION
+	mfspr   r3,SPRN_L1CSR0
+	ori     r3,r3,L1CSR0_CFI|L1CSR0_CLFC
+	/* msync; isync recommended here */
+	mtspr   SPRN_L1CSR0,r3
+	isync
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
+	mfspr	r3,SPRN_L1CSR1
+	ori	r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR
+	mtspr	SPRN_L1CSR1,r3
+#else
+	mfspr	r3,SPRN_PVR
+	rlwinm	r3,r3,16,16,31
+	cmpwi	0,r3,1
+	beqlr			/* for 601, do nothing */
+	/* 603/604 processor - use invalidate-all bit in HID0 */
+	mfspr	r3,SPRN_HID0
+	ori	r3,r3,HID0_ICFI
+	mtspr	SPRN_HID0,r3
+#endif /* CONFIG_4xx */
+	isync
+	blr
+EXPORT_SYMBOL(flush_instruction_cache)
+#endif /* CONFIG_PPC_8xx */
+
+/*
+ * Write any modified data cache blocks out to memory
+ * and invalidate the corresponding instruction cache blocks.
+ * This is a no-op on the 601.
+ *
+ * flush_icache_range(unsigned long start, unsigned long stop)
+ */
+_GLOBAL(flush_icache_range)
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr				/* for 601, do nothing */
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+	rlwinm	r3,r3,0,0,31 - L1_CACHE_SHIFT
+	subf	r4,r3,r4
+	addi	r4,r4,L1_CACHE_BYTES - 1
+	srwi.	r4,r4,L1_CACHE_SHIFT
+	beqlr
+	mtctr	r4
+	mr	r6,r3
+1:	dcbst	0,r3
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	1b
+	sync				/* wait for dcbst's to get to ram */
+#ifndef CONFIG_44x
+	mtctr	r4
+2:	icbi	0,r6
+	addi	r6,r6,L1_CACHE_BYTES
+	bdnz	2b
+#else
+	/* Flash invalidate on 44x because we are passed kmapped addresses and
+	   this doesn't work for userspace pages due to the virtually tagged
+	   icache.  Sigh. */
+	iccci	0, r0
+#endif
+	sync				/* additional sync needed on g4 */
+	isync
+	blr
+_ASM_NOKPROBE_SYMBOL(flush_icache_range)
+EXPORT_SYMBOL(flush_icache_range)
+
+/*
+ * Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ * This is a no-op on the 601 which has a unified cache.
+ *
+ *	void __flush_dcache_icache(void *page)
+ */
+_GLOBAL(__flush_dcache_icache)
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
+	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
+	mtctr	r4
+	mr	r6,r3
+0:	dcbst	0,r3				/* Write line to ram */
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	0b
+	sync
+#ifdef CONFIG_44x
+	/* We don't flush the icache on 44x. Those have a virtual icache
+	 * and we don't have access to the virtual address here (it's
+	 * not the page vaddr but where it's mapped in user space). The
+	 * flushing of the icache on these is handled elsewhere, when
+	 * a change in the address space occurs, before returning to
+	 * user space
+	 */
+BEGIN_MMU_FTR_SECTION
+	blr
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
+#endif /* CONFIG_44x */
+	mtctr	r4
+1:	icbi	0,r6
+	addi	r6,r6,L1_CACHE_BYTES
+	bdnz	1b
+	sync
+	isync
+	blr
+
+#ifndef CONFIG_BOOKE
+/*
+ * Flush a particular page from the data cache to RAM, identified
+ * by its physical address.  We turn off the MMU so we can just use
+ * the physical address (this may be a highmem page without a kernel
+ * mapping).
+ *
+ *	void __flush_dcache_icache_phys(unsigned long physaddr)
+ */
+_GLOBAL(__flush_dcache_icache_phys)
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr					/* for 601, do nothing */
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+	mfmsr	r10
+	rlwinm	r0,r10,0,28,26			/* clear DR */
+	mtmsr	r0
+	isync
+	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
+	li	r4,PAGE_SIZE/L1_CACHE_BYTES	/* Number of lines in a page */
+	mtctr	r4
+	mr	r6,r3
+0:	dcbst	0,r3				/* Write line to ram */
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	0b
+	sync
+	mtctr	r4
+1:	icbi	0,r6
+	addi	r6,r6,L1_CACHE_BYTES
+	bdnz	1b
+	sync
+	mtmsr	r10				/* restore DR */
+	isync
+	blr
+#endif /* CONFIG_BOOKE */
+
+/*
+ * Copy a whole page.  We use the dcbz instruction on the destination
+ * to reduce memory traffic (it eliminates the unnecessary reads of
+ * the destination into cache).  This requires that the destination
+ * is cacheable.
+ */
+#define COPY_16_BYTES		\
+	lwz	r6,4(r4);	\
+	lwz	r7,8(r4);	\
+	lwz	r8,12(r4);	\
+	lwzu	r9,16(r4);	\
+	stw	r6,4(r3);	\
+	stw	r7,8(r3);	\
+	stw	r8,12(r3);	\
+	stwu	r9,16(r3)
+
+_GLOBAL(copy_page)
+	addi	r3,r3,-4
+	addi	r4,r4,-4
+
+	li	r5,4
+
+#if MAX_COPY_PREFETCH > 1
+	li	r0,MAX_COPY_PREFETCH
+	li	r11,4
+	mtctr	r0
+11:	dcbt	r11,r4
+	addi	r11,r11,L1_CACHE_BYTES
+	bdnz	11b
+#else /* MAX_COPY_PREFETCH == 1 */
+	dcbt	r5,r4
+	li	r11,L1_CACHE_BYTES+4
+#endif /* MAX_COPY_PREFETCH */
+	li	r0,PAGE_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH
+	crclr	4*cr0+eq
+2:
+	mtctr	r0
+1:
+	dcbt	r11,r4
+	dcbz	r5,r3
+	COPY_16_BYTES
+#if L1_CACHE_BYTES >= 32
+	COPY_16_BYTES
+#if L1_CACHE_BYTES >= 64
+	COPY_16_BYTES
+	COPY_16_BYTES
+#if L1_CACHE_BYTES >= 128
+	COPY_16_BYTES
+	COPY_16_BYTES
+	COPY_16_BYTES
+	COPY_16_BYTES
+#endif
+#endif
+#endif
+	bdnz	1b
+	beqlr
+	crnot	4*cr0+eq,4*cr0+eq
+	li	r0,MAX_COPY_PREFETCH
+	li	r11,4
+	b	2b
+EXPORT_SYMBOL(copy_page)
+
+/*
+ * Extended precision shifts.
+ *
+ * Updated to be valid for shift counts from 0 to 63 inclusive.
+ * -- Gabriel
+ *
+ * R3/R4 has 64 bit value
+ * R5    has shift count
+ * result in R3/R4
+ *
+ *  ashrdi3: arithmetic right shift (sign propagation)	
+ *  lshrdi3: logical right shift
+ *  ashldi3: left shift
+ */
+_GLOBAL(__ashrdi3)
+	subfic	r6,r5,32
+	srw	r4,r4,r5	# LSW = count > 31 ? 0 : LSW >> count
+	addi	r7,r5,32	# could be xori, or addi with -32
+	slw	r6,r3,r6	# t1 = count > 31 ? 0 : MSW << (32-count)
+	rlwinm	r8,r7,0,32	# t3 = (count < 32) ? 32 : 0
+	sraw	r7,r3,r7	# t2 = MSW >> (count-32)
+	or	r4,r4,r6	# LSW |= t1
+	slw	r7,r7,r8	# t2 = (count < 32) ? 0 : t2
+	sraw	r3,r3,r5	# MSW = MSW >> count
+	or	r4,r4,r7	# LSW |= t2
+	blr
+EXPORT_SYMBOL(__ashrdi3)
+
+_GLOBAL(__ashldi3)
+	subfic	r6,r5,32
+	slw	r3,r3,r5	# MSW = count > 31 ? 0 : MSW << count
+	addi	r7,r5,32	# could be xori, or addi with -32
+	srw	r6,r4,r6	# t1 = count > 31 ? 0 : LSW >> (32-count)
+	slw	r7,r4,r7	# t2 = count < 32 ? 0 : LSW << (count-32)
+	or	r3,r3,r6	# MSW |= t1
+	slw	r4,r4,r5	# LSW = LSW << count
+	or	r3,r3,r7	# MSW |= t2
+	blr
+EXPORT_SYMBOL(__ashldi3)
+
+_GLOBAL(__lshrdi3)
+	subfic	r6,r5,32
+	srw	r4,r4,r5	# LSW = count > 31 ? 0 : LSW >> count
+	addi	r7,r5,32	# could be xori, or addi with -32
+	slw	r6,r3,r6	# t1 = count > 31 ? 0 : MSW << (32-count)
+	srw	r7,r3,r7	# t2 = count < 32 ? 0 : MSW >> (count-32)
+	or	r4,r4,r6	# LSW |= t1
+	srw	r3,r3,r5	# MSW = MSW >> count
+	or	r4,r4,r7	# LSW |= t2
+	blr
+EXPORT_SYMBOL(__lshrdi3)
+
+/*
+ * 64-bit comparison: __cmpdi2(s64 a, s64 b)
+ * Returns 0 if a < b, 1 if a == b, 2 if a > b.
+ */
+_GLOBAL(__cmpdi2)
+	cmpw	r3,r5
+	li	r3,1
+	bne	1f
+	cmplw	r4,r6
+	beqlr
+1:	li	r3,0
+	bltlr
+	li	r3,2
+	blr
+EXPORT_SYMBOL(__cmpdi2)
+/*
+ * 64-bit comparison: __ucmpdi2(u64 a, u64 b)
+ * Returns 0 if a < b, 1 if a == b, 2 if a > b.
+ */
+_GLOBAL(__ucmpdi2)
+	cmplw	r3,r5
+	li	r3,1
+	bne	1f
+	cmplw	r4,r6
+	beqlr
+1:	li	r3,0
+	bltlr
+	li	r3,2
+	blr
+EXPORT_SYMBOL(__ucmpdi2)
+
+_GLOBAL(__bswapdi2)
+	rotlwi  r9,r4,8
+	rotlwi  r10,r3,8
+	rlwimi  r9,r4,24,0,7
+	rlwimi  r10,r3,24,0,7
+	rlwimi  r9,r4,24,16,23
+	rlwimi  r10,r3,24,16,23
+	mr      r3,r9
+	mr      r4,r10
+	blr
+EXPORT_SYMBOL(__bswapdi2)
+
+#ifdef CONFIG_SMP
+_GLOBAL(start_secondary_resume)
+	/* Reset stack */
+	CURRENT_THREAD_INFO(r1, r1)
+	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
+	li	r3,0
+	stw	r3,0(r1)		/* Zero the stack frame pointer	*/
+	bl	start_secondary
+	b	.
+#endif /* CONFIG_SMP */
+	
+/*
+ * This routine is just here to keep GCC happy - sigh...
+ */
+_GLOBAL(__main)
+	blr
+
+#ifdef CONFIG_KEXEC_CORE
+	/*
+	 * Must be relocatable PIC code callable as a C function.
+	 */
+	.globl relocate_new_kernel
+relocate_new_kernel:
+	/* r3 = page_list   */
+	/* r4 = reboot_code_buffer */
+	/* r5 = start_address      */
+
+#ifdef CONFIG_FSL_BOOKE
+
+	mr	r29, r3
+	mr	r30, r4
+	mr	r31, r5
+
+#define ENTRY_MAPPING_KEXEC_SETUP
+#include "fsl_booke_entry_mapping.S"
+#undef ENTRY_MAPPING_KEXEC_SETUP
+
+	mr      r3, r29
+	mr      r4, r30
+	mr      r5, r31
+
+	li	r0, 0
+#elif defined(CONFIG_44x)
+
+	/* Save our parameters */
+	mr	r29, r3
+	mr	r30, r4
+	mr	r31, r5
+
+#ifdef CONFIG_PPC_47x
+	/* Check for 47x cores */
+	mfspr	r3,SPRN_PVR
+	srwi	r3,r3,16
+	cmplwi	cr0,r3,PVR_476FPE@h
+	beq	setup_map_47x
+	cmplwi	cr0,r3,PVR_476@h
+	beq	setup_map_47x
+	cmplwi	cr0,r3,PVR_476_ISS@h
+	beq	setup_map_47x
+#endif /* CONFIG_PPC_47x */
+	
+/*
+ * Code for setting up 1:1 mapping for PPC440x for KEXEC
+ *
+ * We cannot switch off the MMU on PPC44x.
+ * So we:
+ * 1) Invalidate all the mappings except the one we are running from.
+ * 2) Create a tmp mapping for our code in the other address space(TS) and
+ *    jump to it. Invalidate the entry we started in.
+ * 3) Create a 1:1 mapping for 0-2GiB in chunks of 256M in original TS.
+ * 4) Jump to the 1:1 mapping in original TS.
+ * 5) Invalidate the tmp mapping.
+ *
+ * - Based on the kexec support code for FSL BookE
+ *
+ */
+
+	/* 
+	 * Load the PID with kernel PID (0).
+	 * Also load our MSR_IS and TID to MMUCR for TLB search.
+	 */
+	li	r3, 0
+	mtspr	SPRN_PID, r3
+	mfmsr	r4
+	andi.	r4,r4,MSR_IS@l
+	beq	wmmucr
+	oris	r3,r3,PPC44x_MMUCR_STS@h
+wmmucr:
+	mtspr	SPRN_MMUCR,r3
+	sync
+
+	/*
+	 * Invalidate all the TLB entries except the current entry
+	 * where we are running from
+	 */
+	bl	0f				/* Find our address */
+0:	mflr	r5				/* Make it accessible */
+	tlbsx	r23,0,r5			/* Find entry we are in */
+	li	r4,0				/* Start at TLB entry 0 */
+	li	r3,0				/* Set PAGEID inval value */
+1:	cmpw	r23,r4				/* Is this our entry? */
+	beq	skip				/* If so, skip the inval */
+	tlbwe	r3,r4,PPC44x_TLB_PAGEID		/* If not, inval the entry */
+skip:
+	addi	r4,r4,1				/* Increment */
+	cmpwi	r4,64				/* Are we done?	*/
+	bne	1b				/* If not, repeat */
+	isync
+
+	/* Create a temp mapping and jump to it */
+	andi.	r6, r23, 1		/* Find the index to use */
+	addi	r24, r6, 1		/* r24 will contain 1 or 2 */
+
+	mfmsr	r9			/* get the MSR */
+	rlwinm	r5, r9, 27, 31, 31	/* Extract the MSR[IS] */
+	xori	r7, r5, 1		/* Use the other address space */
+
+	/* Read the current mapping entries */
+	tlbre	r3, r23, PPC44x_TLB_PAGEID
+	tlbre	r4, r23, PPC44x_TLB_XLAT
+	tlbre	r5, r23, PPC44x_TLB_ATTRIB
+
+	/* Save our current XLAT entry */
+	mr	r25, r4
+
+	/* Extract the TLB PageSize */
+	li	r10, 1 			/* r10 will hold PageSize */
+	rlwinm	r11, r3, 0, 24, 27	/* bits 24-27 */
+
+	/* XXX: As of now we use 256M, 4K pages */
+	cmpwi	r11, PPC44x_TLB_256M
+	bne	tlb_4k
+	rotlwi	r10, r10, 28		/* r10 = 256M */
+	b	write_out
+tlb_4k:
+	cmpwi	r11, PPC44x_TLB_4K
+	bne	default
+	rotlwi	r10, r10, 12		/* r10 = 4K */
+	b	write_out
+default:
+	rotlwi	r10, r10, 10		/* r10 = 1K */
+
+write_out:
+	/*
+	 * Write out the tmp 1:1 mapping for this code in other address space
+	 * Fixup  EPN = RPN , TS=other address space
+	 */
+	insrwi	r3, r7, 1, 23		/* Bit 23 is TS for PAGEID field */
+
+	/* Write out the tmp mapping entries */
+	tlbwe	r3, r24, PPC44x_TLB_PAGEID
+	tlbwe	r4, r24, PPC44x_TLB_XLAT
+	tlbwe	r5, r24, PPC44x_TLB_ATTRIB
+
+	subi	r11, r10, 1		/* PageOffset Mask = PageSize - 1 */
+	not	r10, r11		/* Mask for PageNum */
+
+	/* Switch to other address space in MSR */
+	insrwi	r9, r7, 1, 26		/* Set MSR[IS] = r7 */
+
+	bl	1f
+1:	mflr	r8
+	addi	r8, r8, (2f-1b)		/* Find the target offset */
+
+	/* Jump to the tmp mapping */
+	mtspr	SPRN_SRR0, r8
+	mtspr	SPRN_SRR1, r9
+	rfi
+
+2:
+	/* Invalidate the entry we were executing from */
+	li	r3, 0
+	tlbwe	r3, r23, PPC44x_TLB_PAGEID
+
+	/* attribute fields. rwx for SUPERVISOR mode */
+	li	r5, 0
+	ori	r5, r5, (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G)
+
+	/* Create 1:1 mapping in 256M pages */
+	xori	r7, r7, 1			/* Revert back to Original TS */
+
+	li	r8, 0				/* PageNumber */
+	li	r6, 3				/* TLB Index, start at 3  */
+
+next_tlb:
+	rotlwi	r3, r8, 28			/* Create EPN (bits 0-3) */
+	mr	r4, r3				/* RPN = EPN  */
+	ori	r3, r3, (PPC44x_TLB_VALID | PPC44x_TLB_256M) /* SIZE = 256M, Valid */
+	insrwi	r3, r7, 1, 23			/* Set TS from r7 */
+
+	tlbwe	r3, r6, PPC44x_TLB_PAGEID	/* PageID field : EPN, V, SIZE */
+	tlbwe	r4, r6, PPC44x_TLB_XLAT		/* Address translation : RPN   */
+	tlbwe	r5, r6, PPC44x_TLB_ATTRIB	/* Attributes */
+
+	addi	r8, r8, 1			/* Increment PN */
+	addi	r6, r6, 1			/* Increment TLB Index */
+	cmpwi	r8, 8				/* Are we done ? */
+	bne	next_tlb
+	isync
+
+	/* Jump to the new mapping 1:1 */
+	li	r9,0
+	insrwi	r9, r7, 1, 26			/* Set MSR[IS] = r7 */
+
+	bl	1f
+1:	mflr	r8
+	and	r8, r8, r11			/* Get our offset within page */
+	addi	r8, r8, (2f-1b)
+
+	and	r5, r25, r10			/* Get our target PageNum */
+	or	r8, r8, r5			/* Target jump address */
+
+	mtspr	SPRN_SRR0, r8
+	mtspr	SPRN_SRR1, r9
+	rfi
+2:
+	/* Invalidate the tmp entry we used */
+	li	r3, 0
+	tlbwe	r3, r24, PPC44x_TLB_PAGEID
+	sync
+	b	ppc44x_map_done
+
+#ifdef CONFIG_PPC_47x
+
+	/* 1:1 mapping for 47x */
+
+setup_map_47x:
+
+	/*
+	 * Load the kernel pid (0) to PID and also to MMUCR[TID].
+	 * Also set the MSR IS->MMUCR STS
+	 */
+	li	r3, 0
+	mtspr	SPRN_PID, r3			/* Set PID */
+	mfmsr	r4				/* Get MSR */
+	andi.	r4, r4, MSR_IS@l		/* TS=1? */
+	beq	1f				/* If not, leave STS=0 */
+	oris	r3, r3, PPC47x_MMUCR_STS@h	/* Set STS=1 */
+1:	mtspr	SPRN_MMUCR, r3			/* Put MMUCR */
+	sync
+
+	/* Find the entry we are running from */
+	bl	2f
+2:	mflr	r23
+	tlbsx	r23, 0, r23
+	tlbre	r24, r23, 0			/* TLB Word 0 */
+	tlbre	r25, r23, 1			/* TLB Word 1 */
+	tlbre	r26, r23, 2			/* TLB Word 2 */
+
+
+	/*
+	 * Invalidates all the tlb entries by writing to 256 RPNs(r4)
+	 * of 4k page size in all  4 ways (0-3 in r3).
+	 * This would invalidate the entire UTLB including the one we are
+	 * running from. However the shadow TLB entries would help us 
+	 * to continue the execution, until we flush them (rfi/isync).
+	 */
+	addis	r3, 0, 0x8000			/* specify the way */
+	addi	r4, 0, 0			/* TLB Word0 = (EPN=0, VALID = 0) */
+	addi	r5, 0, 0
+	b	clear_utlb_entry
+
+	/* Align the loop to speed things up. from head_44x.S */
+	.align	6
+
+clear_utlb_entry:
+
+	tlbwe	r4, r3, 0
+	tlbwe	r5, r3, 1
+	tlbwe	r5, r3, 2
+	addis	r3, r3, 0x2000			/* Increment the way */
+	cmpwi	r3, 0
+	bne	clear_utlb_entry
+	addis	r3, 0, 0x8000
+	addis	r4, r4, 0x100			/* Increment the EPN */
+	cmpwi	r4, 0
+	bne	clear_utlb_entry
+
+	/* Create the entries in the other address space */
+	mfmsr	r5
+	rlwinm	r7, r5, 27, 31, 31		/* Get the TS (Bit 26) from MSR */
+	xori	r7, r7, 1			/* r7 = !TS */
+
+	insrwi	r24, r7, 1, 21			/* Change the TS in the saved TLB word 0 */
+
+	/* 
+	 * write out the TLB entries for the tmp mapping
+	 * Use way '0' so that we could easily invalidate it later.
+	 */
+	lis	r3, 0x8000			/* Way '0' */ 
+
+	tlbwe	r24, r3, 0
+	tlbwe	r25, r3, 1
+	tlbwe	r26, r3, 2
+
+	/* Update the msr to the new TS */
+	insrwi	r5, r7, 1, 26
+
+	bl	1f
+1:	mflr	r6
+	addi	r6, r6, (2f-1b)
+
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r5
+	rfi
+
+	/* 
+	 * Now we are in the tmp address space.
+	 * Create a 1:1 mapping for 0-2GiB in the original TS.
+	 */
+2:
+	li	r3, 0
+	li	r4, 0				/* TLB Word 0 */
+	li	r5, 0				/* TLB Word 1 */
+	li	r6, 0
+	ori	r6, r6, PPC47x_TLB2_S_RWX	/* TLB word 2 */
+
+	li	r8, 0				/* PageIndex */
+
+	xori	r7, r7, 1			/* revert back to original TS */
+
+write_utlb:
+	rotlwi	r5, r8, 28			/* RPN = PageIndex * 256M */
+						/* ERPN = 0 as we don't use memory above 2G */
+
+	mr	r4, r5				/* EPN = RPN */
+	ori	r4, r4, (PPC47x_TLB0_VALID | PPC47x_TLB0_256M)
+	insrwi	r4, r7, 1, 21			/* Insert the TS to Word 0 */
+
+	tlbwe	r4, r3, 0			/* Write out the entries */
+	tlbwe	r5, r3, 1
+	tlbwe	r6, r3, 2
+	addi	r8, r8, 1
+	cmpwi	r8, 8				/* Have we completed ? */
+	bne	write_utlb
+
+	/* make sure we complete the TLB write up */
+	isync
+
+	/* 
+	 * Prepare to jump to the 1:1 mapping.
+	 * 1) Extract page size of the tmp mapping
+	 *    DSIZ = TLB_Word0[22:27]
+	 * 2) Calculate the physical address of the address
+	 *    to jump to.
+	 */
+	rlwinm	r10, r24, 0, 22, 27
+
+	cmpwi	r10, PPC47x_TLB0_4K
+	bne	0f
+	li	r10, 0x1000			/* r10 = 4k */
+	bl	1f
+
+0:
+	/* Defaults to 256M */
+	lis	r10, 0x1000
+	
+	bl	1f
+1:	mflr	r4
+	addi	r4, r4, (2f-1b)			/* virtual address  of 2f */
+
+	subi	r11, r10, 1			/* offsetmask = Pagesize - 1 */
+	not	r10, r11			/* Pagemask = ~(offsetmask) */
+
+	and	r5, r25, r10			/* Physical page */
+	and	r6, r4, r11			/* offset within the current page */
+
+	or	r5, r5, r6			/* Physical address for 2f */
+
+	/* Switch the TS in MSR to the original one */
+	mfmsr	r8
+	insrwi	r8, r7, 1, 26
+
+	mtspr	SPRN_SRR1, r8
+	mtspr	SPRN_SRR0, r5
+	rfi
+
+2:
+	/* Invalidate the tmp mapping */
+	lis	r3, 0x8000			/* Way '0' */
+
+	clrrwi	r24, r24, 12			/* Clear the valid bit */
+	tlbwe	r24, r3, 0
+	tlbwe	r25, r3, 1
+	tlbwe	r26, r3, 2
+
+	/* Make sure we complete the TLB write and flush the shadow TLB */
+	isync
+
+#endif
+
+ppc44x_map_done:
+
+
+	/* Restore the parameters */
+	mr	r3, r29
+	mr	r4, r30
+	mr	r5, r31
+
+	li	r0, 0
+#else
+	li	r0, 0
+
+	/*
+	 * Set Machine Status Register to a known status,
+	 * switch the MMU off and jump to 1: in a single step.
+	 */
+
+	mr	r8, r0
+	ori     r8, r8, MSR_RI|MSR_ME
+	mtspr	SPRN_SRR1, r8
+	addi	r8, r4, 1f - relocate_new_kernel
+	mtspr	SPRN_SRR0, r8
+	sync
+	rfi
+
+1:
+#endif
+	/* from this point address translation is turned off */
+	/* and interrupts are disabled */
+
+	/* set a new stack at the bottom of our page... */
+	/* (not really needed now) */
+	addi	r1, r4, KEXEC_CONTROL_PAGE_SIZE - 8 /* for LR Save+Back Chain */
+	stw	r0, 0(r1)
+
+	/* Do the copies */
+	li	r6, 0 /* checksum */
+	mr	r0, r3
+	b	1f
+
+0:	/* top, read another word for the indirection page */
+	lwzu	r0, 4(r3)
+
+1:
+	/* is it a destination page? (r8) */
+	rlwinm.	r7, r0, 0, 31, 31 /* IND_DESTINATION (1<<0) */
+	beq	2f
+
+	rlwinm	r8, r0, 0, 0, 19 /* clear kexec flags, page align */
+	b	0b
+
+2:	/* is it an indirection page? (r3) */
+	rlwinm.	r7, r0, 0, 30, 30 /* IND_INDIRECTION (1<<1) */
+	beq	2f
+
+	rlwinm	r3, r0, 0, 0, 19 /* clear kexec flags, page align */
+	subi	r3, r3, 4
+	b	0b
+
+2:	/* are we done? */
+	rlwinm.	r7, r0, 0, 29, 29 /* IND_DONE (1<<2) */
+	beq	2f
+	b	3f
+
+2:	/* is it a source page? (r9) */
+	rlwinm.	r7, r0, 0, 28, 28 /* IND_SOURCE (1<<3) */
+	beq	0b
+
+	rlwinm	r9, r0, 0, 0, 19 /* clear kexec flags, page align */
+
+	li	r7, PAGE_SIZE / 4
+	mtctr   r7
+	subi    r9, r9, 4
+	subi    r8, r8, 4
+9:
+	lwzu    r0, 4(r9)  /* do the copy */
+	xor	r6, r6, r0
+	stwu    r0, 4(r8)
+	dcbst	0, r8
+	sync
+	icbi	0, r8
+	bdnz    9b
+
+	addi    r9, r9, 4
+	addi    r8, r8, 4
+	b	0b
+
+3:
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * execute a serializing instruction here.
+	 */
+	isync
+	sync
+
+	mfspr	r3, SPRN_PIR /* current core we are running on */
+	mr	r4, r5 /* load physical address of chunk called */
+
+	/* jump to the entry point, usually the setup routine */
+	mtlr	r5
+	blrl
+
+1:	b	1b
+
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.long relocate_new_kernel_end - relocate_new_kernel
+#endif
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
new file mode 100644
index 000000000..facc02964
--- /dev/null
+++ b/arch/powerpc/kernel/misc_64.S
@@ -0,0 +1,696 @@
+/*
+ * This file contains miscellaneous low-level functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sys.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cache.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/kexec.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+
+	.text
+
+_GLOBAL(call_do_softirq)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
+	mr	r1,r3
+	bl	__do_softirq
+	ld	r1,0(r1)
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+_GLOBAL(call_do_irq)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+	mr	r1,r4
+	bl	__do_irq
+	ld	r1,0(r1)
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/*
+ * Write any modified data cache blocks out to memory
+ * and invalidate the corresponding instruction cache blocks.
+ *
+ * flush_icache_range(unsigned long start, unsigned long stop)
+ *
+ *   flush all bytes from start through stop-1 inclusive
+ */
+
+_GLOBAL_TOC(flush_icache_range)
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ * and in some cases i-cache and d-cache line sizes differ from
+ * each other.
+ */
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)/* Get cache block size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of cache block size */
+	srd.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+1:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	1b
+	sync
+
+/* Now invalidate the instruction cache */
+	
+	lwz	r7,ICACHEL1BLOCKSIZE(r10)	/* Get Icache block size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5
+	lwz	r9,ICACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of Icache block size */
+	srd.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+2:	icbi	0,r6
+	add	r6,r6,r7
+	bdnz	2b
+	isync
+	blr
+_ASM_NOKPROBE_SYMBOL(flush_icache_range)
+EXPORT_SYMBOL(flush_icache_range)
+
+/*
+ * Like above, but only do the D-cache.
+ *
+ * flush_dcache_range(unsigned long start, unsigned long stop)
+ *
+ *    flush all bytes from start to stop-1 inclusive
+ */
+_GLOBAL_TOC(flush_dcache_range)
+
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ */
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of dcache block size */
+	srd.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	mtctr	r8
+0:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	0b
+	sync
+	blr
+EXPORT_SYMBOL(flush_dcache_range)
+
+_GLOBAL(flush_inval_dcache_range)
+ 	ld	r10,PPC64_CACHES@toc(r2)
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
+	addi	r5,r7,-1
+	andc	r6,r3,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)/* Get log-2 of dcache block size */
+	srd.	r8,r8,r9		/* compute line count */
+	beqlr				/* nothing to do? */
+	sync
+	isync
+	mtctr	r8
+0:	dcbf	0,r6
+	add	r6,r6,r7
+	bdnz	0b
+	sync
+	isync
+	blr
+
+
+/*
+ * Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ *
+ *	void __flush_dcache_icache(void *page)
+ */
+_GLOBAL(__flush_dcache_icache)
+/*
+ * Flush the data cache to memory 
+ * 
+ * Different systems have different cache line sizes
+ */
+
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+
+/* Flush the dcache */
+ 	ld	r7,PPC64_CACHES@toc(r2)
+	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
+	lwz	r4,DCACHEL1BLOCKSPERPAGE(r7)	/* Get # dcache blocks per page */
+	lwz	r5,DCACHEL1BLOCKSIZE(r7)	/* Get dcache block size */
+	mr	r6,r3
+	mtctr	r4
+0:	dcbst	0,r6
+	add	r6,r6,r5
+	bdnz	0b
+	sync
+
+/* Now invalidate the icache */	
+
+	lwz	r4,ICACHEL1BLOCKSPERPAGE(r7)	/* Get # icache blocks per page */
+	lwz	r5,ICACHEL1BLOCKSIZE(r7)	/* Get icache block size */
+	mtctr	r4
+1:	icbi	0,r3
+	add	r3,r3,r5
+	bdnz	1b
+	isync
+	blr
+
+_GLOBAL(__bswapdi2)
+EXPORT_SYMBOL(__bswapdi2)
+	srdi	r8,r3,32
+	rlwinm	r7,r3,8,0xffffffff
+	rlwimi	r7,r3,24,0,7
+	rlwinm	r9,r8,8,0xffffffff
+	rlwimi	r7,r3,24,16,23
+	rlwimi	r9,r8,24,0,7
+	rlwimi	r9,r8,24,16,23
+	sldi	r7,r7,32
+	or	r3,r7,r9
+	blr
+
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+_GLOBAL(rmci_on)
+	sync
+	isync
+	li	r3,0x100
+	rldicl	r3,r3,32,0
+	mfspr	r5,SPRN_HID4
+	or	r5,r5,r3
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	sync
+	blr
+
+_GLOBAL(rmci_off)
+	sync
+	isync
+	li	r3,0x100
+	rldicl	r3,r3,32,0
+	mfspr	r5,SPRN_HID4
+	andc	r5,r5,r3
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	sync
+	blr
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
+
+#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+
+/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_readb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	mfspr	r6,SPRN_HID4
+	rldicl	r5,r6,32,0
+	ori	r5,r5,0x100
+	rldicl	r5,r5,32,0
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	lbz	r3,0(r3)
+	sync
+	mtspr	SPRN_HID4,r6
+	isync
+	slbia
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+
+	/*
+ * Do an IO access in real mode
+ */
+_GLOBAL(real_writeb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	mfspr	r6,SPRN_HID4
+	rldicl	r5,r6,32,0
+	ori	r5,r5,0x100
+	rldicl	r5,r5,32,0
+	sync
+	mtspr	SPRN_HID4,r5
+	isync
+	slbia
+	isync
+	stb	r3,0(r4)
+	sync
+	mtspr	SPRN_HID4,r6
+	isync
+	slbia
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+#endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) */
+
+#ifdef CONFIG_PPC_PASEMI
+
+_GLOBAL(real_205_readb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	LBZCIX(R3,R0,R3)
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+
+_GLOBAL(real_205_writeb)
+	mfmsr	r7
+	ori	r0,r7,MSR_DR
+	xori	r0,r0,MSR_DR
+	sync
+	mtmsrd	r0
+	sync
+	isync
+	STBCIX(R3,R0,R4)
+	isync
+	mtmsrd	r7
+	sync
+	isync
+	blr
+
+#endif /* CONFIG_PPC_PASEMI */
+
+
+#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE)
+/*
+ * SCOM access functions for 970 (FX only for now)
+ *
+ * unsigned long scom970_read(unsigned int address);
+ * void scom970_write(unsigned int address, unsigned long value);
+ *
+ * The address passed in is the 24 bits register address. This code
+ * is 970 specific and will not check the status bits, so you should
+ * know what you are doing.
+ */
+_GLOBAL(scom970_read)
+	/* interrupts off */
+	mfmsr	r4
+	ori	r0,r4,MSR_EE
+	xori	r0,r0,MSR_EE
+	mtmsrd	r0,1
+
+	/* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits
+	 * (including parity). On current CPUs they must be 0'd,
+	 * and finally or in RW bit
+	 */
+	rlwinm	r3,r3,8,0,15
+	ori	r3,r3,0x8000
+
+	/* do the actual scom read */
+	sync
+	mtspr	SPRN_SCOMC,r3
+	isync
+	mfspr	r3,SPRN_SCOMD
+	isync
+	mfspr	r0,SPRN_SCOMC
+	isync
+
+	/* XXX:	fixup result on some buggy 970's (ouch ! we lost a bit, bah
+	 * that's the best we can do). Not implemented yet as we don't use
+	 * the scom on any of the bogus CPUs yet, but may have to be done
+	 * ultimately
+	 */
+
+	/* restore interrupts */
+	mtmsrd	r4,1
+	blr
+
+
+_GLOBAL(scom970_write)
+	/* interrupts off */
+	mfmsr	r5
+	ori	r0,r5,MSR_EE
+	xori	r0,r0,MSR_EE
+	mtmsrd	r0,1
+
+	/* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits
+	 * (including parity). On current CPUs they must be 0'd.
+	 */
+
+	rlwinm	r3,r3,8,0,15
+
+	sync
+	mtspr	SPRN_SCOMD,r4      /* write data */
+	isync
+	mtspr	SPRN_SCOMC,r3      /* write command */
+	isync
+	mfspr	3,SPRN_SCOMC
+	isync
+
+	/* restore interrupts */
+	mtmsrd	r5,1
+	blr
+#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */
+
+/* kexec_wait(phys_cpu)
+ *
+ * wait for the flag to change, indicating this kernel is going away but
+ * the slave code for the next one is at addresses 0 to 100.
+ *
+ * This is used by all slaves, even those that did not find a matching
+ * paca in the secondary startup code.
+ *
+ * Physical (hardware) cpu id should be in r3.
+ */
+_GLOBAL(kexec_wait)
+	bl	1f
+1:	mflr	r5
+	addi	r5,r5,kexec_flag-1b
+
+99:	HMT_LOW
+#ifdef CONFIG_KEXEC_CORE	/* use no memory without kexec */
+	lwz	r4,0(r5)
+	cmpwi	0,r4,0
+	beq	99b
+#ifdef CONFIG_PPC_BOOK3S_64
+	li	r10,0x60
+	mfmsr	r11
+	clrrdi	r11,r11,1	/* Clear MSR_LE */
+	mtsrr0	r10
+	mtsrr1	r11
+	rfid
+#else
+	/* Create TLB entry in book3e_secondary_core_init */
+	li	r4,0
+	ba	0x60
+#endif
+#endif
+
+/* this can be in text because we won't change it until we are
+ * running in real anyways
+ */
+kexec_flag:
+	.long	0
+
+
+#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_PPC_BOOK3E
+/*
+ * BOOK3E has no real MMU mode, so we have to setup the initial TLB
+ * for a core to identity map v:0 to p:0.  This current implementation
+ * assumes that 1G is enough for kexec.
+ */
+kexec_create_tlb:
+	/*
+	 * Invalidate all non-IPROT TLB entries to avoid any TLB conflict.
+	 * IPROT TLB entries should be >= PAGE_OFFSET and thus not conflict.
+	 */
+	PPC_TLBILX_ALL(0,R0)
+	sync
+	isync
+
+	mfspr	r10,SPRN_TLB1CFG
+	andi.	r10,r10,TLBnCFG_N_ENTRY	/* Extract # entries */
+	subi	r10,r10,1	/* Last entry: no conflict with kernel text */
+	lis	r9,MAS0_TLBSEL(1)@h
+	rlwimi	r9,r10,16,4,15		/* Setup MAS0 = TLBSEL | ESEL(r9) */
+
+/* Set up a temp identity mapping v:0 to p:0 and return to it. */
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC)
+#define M_IF_NEEDED	MAS2_M
+#else
+#define M_IF_NEEDED	0
+#endif
+	mtspr	SPRN_MAS0,r9
+
+	lis	r9,(MAS1_VALID|MAS1_IPROT)@h
+	ori	r9,r9,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l
+	mtspr	SPRN_MAS1,r9
+
+	LOAD_REG_IMMEDIATE(r9, 0x0 | M_IF_NEEDED)
+	mtspr	SPRN_MAS2,r9
+
+	LOAD_REG_IMMEDIATE(r9, 0x0 | MAS3_SR | MAS3_SW | MAS3_SX)
+	mtspr	SPRN_MAS3,r9
+	li	r9,0
+	mtspr	SPRN_MAS7,r9
+
+	tlbwe
+	isync
+	blr
+#endif
+
+/* kexec_smp_wait(void)
+ *
+ * call with interrupts off
+ * note: this is a terminal routine, it does not save lr
+ *
+ * get phys id from paca
+ * switch to real mode
+ * mark the paca as no longer used
+ * join other cpus in kexec_wait(phys_id)
+ */
+_GLOBAL(kexec_smp_wait)
+	lhz	r3,PACAHWCPUID(r13)
+	bl	real_mode
+
+	li	r4,KEXEC_STATE_REAL_MODE
+	stb	r4,PACAKEXECSTATE(r13)
+	SYNC
+
+	b	kexec_wait
+
+/*
+ * switch to real mode (turn mmu off)
+ * we use the early kernel trick that the hardware ignores bits
+ * 0 and 1 (big endian) of the effective address in real mode
+ *
+ * don't overwrite r3 here, it is live for kexec_wait above.
+ */
+real_mode:	/* assume normal blr return */
+#ifdef CONFIG_PPC_BOOK3E
+	/* Create an identity mapping. */
+	b	kexec_create_tlb
+#else
+1:	li	r9,MSR_RI
+	li	r10,MSR_DR|MSR_IR
+	mflr	r11		/* return address to SRR0 */
+	mfmsr	r12
+	andc	r9,r12,r9
+	andc	r10,r12,r10
+
+	mtmsrd	r9,1
+	mtspr	SPRN_SRR1,r10
+	mtspr	SPRN_SRR0,r11
+	rfid
+#endif
+
+/*
+ * kexec_sequence(newstack, start, image, control, clear_all(),
+	          copy_with_mmu_off)
+ *
+ * does the grungy work with stack switching and real mode switches
+ * also does simple calls to other code
+ */
+
+_GLOBAL(kexec_sequence)
+	mflr	r0
+	std	r0,16(r1)
+
+	/* switch stacks to newstack -- &kexec_stack.stack */
+	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
+	mr	r1,r3
+
+	li	r0,0
+	std	r0,16(r1)
+
+BEGIN_FTR_SECTION
+	/*
+	 * This is the best time to turn AMR/IAMR off.
+	 * key 0 is used in radix for supervisor<->user
+	 * protection, but on hash key 0 is reserved
+	 * ideally we want to enter with a clean state.
+	 * NOTE, we rely on r0 being 0 from above.
+	 */
+	mtspr	SPRN_IAMR,r0
+BEGIN_FTR_SECTION_NESTED(42)
+	mtspr	SPRN_AMOR,r0
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
+	/* save regs for local vars on new stack.
+	 * yes, we won't go back, but ...
+	 */
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	stdu	r1,-STACK_FRAME_OVERHEAD-64(r1)
+
+	/* save args into preserved regs */
+	mr	r31,r3			/* newstack (both) */
+	mr	r30,r4			/* start (real) */
+	mr	r29,r5			/* image (virt) */
+	mr	r28,r6			/* control, unused */
+	mr	r27,r7			/* clear_all() fn desc */
+	mr	r26,r8			/* copy_with_mmu_off */
+	lhz	r25,PACAHWCPUID(r13)	/* get our phys cpu from paca */
+
+	/* disable interrupts, we are overwriting kernel data next */
+#ifdef CONFIG_PPC_BOOK3E
+	wrteei	0
+#else
+	mfmsr	r3
+	rlwinm	r3,r3,0,17,15
+	mtmsrd	r3,1
+#endif
+
+	/* We need to turn the MMU off unless we are in hash mode
+	 * under a hypervisor
+	 */
+	cmpdi	r26,0
+	beq	1f
+	bl	real_mode
+1:
+	/* copy dest pages, flush whole dest image */
+	mr	r3,r29
+	bl	kexec_copy_flush	/* (image) */
+
+	/* turn off mmu now if not done earlier */
+	cmpdi	r26,0
+	bne	1f
+	bl	real_mode
+
+	/* copy  0x100 bytes starting at start to 0 */
+1:	li	r3,0
+	mr	r4,r30		/* start, aka phys mem offset */
+	li	r5,0x100
+	li	r6,0
+	bl	copy_and_flush	/* (dest, src, copy limit, start offset) */
+1:	/* assume normal blr return */
+
+	/* release other cpus to the new kernel secondary start at 0x60 */
+	mflr	r5
+	li	r6,1
+	stw	r6,kexec_flag-1b(5)
+
+	cmpdi	r27,0
+	beq	1f
+
+	/* clear out hardware hash page table and tlb */
+#ifdef PPC64_ELF_ABI_v1
+	ld	r12,0(r27)		/* deref function descriptor */
+#else
+	mr	r12,r27
+#endif
+	mtctr	r12
+	bctrl				/* mmu_hash_ops.hpte_clear_all(void); */
+
+/*
+ *   kexec image calling is:
+ *      the first 0x100 bytes of the entry point are copied to 0
+ *
+ *      all slaves branch to slave = 0x60 (absolute)
+ *              slave(phys_cpu_id);
+ *
+ *      master goes to start = entry point
+ *              start(phys_cpu_id, start, 0);
+ *
+ *
+ *   a wrapper is needed to call existing kernels, here is an approximate
+ *   description of one method:
+ *
+ * v2: (2.6.10)
+ *   start will be near the boot_block (maybe 0x100 bytes before it?)
+ *   it will have a 0x60, which will b to boot_block, where it will wait
+ *   and 0 will store phys into struct boot-block and load r3 from there,
+ *   copy kernel 0-0x100 and tell slaves to back down to 0x60 again
+ *
+ * v1: (2.6.9)
+ *    boot block will have all cpus scanning device tree to see if they
+ *    are the boot cpu ?????
+ *    other device tree differences (prop sizes, va vs pa, etc)...
+ */
+1:	mr	r3,r25	# my phys cpu
+	mr	r4,r30	# start, aka phys mem offset
+	mtlr	4
+	li	r5,0
+	blr	/* image->start(physid, image->start, 0); */
+#endif /* CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
new file mode 100644
index 000000000..2d861a366
--- /dev/null
+++ b/arch/powerpc/kernel/module.c
@@ -0,0 +1,100 @@
+/*  Kernel module help for powerpc.
+    Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
+    Copyright (C) 2008 Freescale Semiconductor, Inc.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+#include <asm/module.h>
+#include <linux/uaccess.h>
+#include <asm/firmware.h>
+#include <linux/sort.h>
+#include <asm/setup.h>
+
+static LIST_HEAD(module_bug_list);
+
+static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
+				    const Elf_Shdr *sechdrs,
+				    const char *name)
+{
+	char *secstrings;
+	unsigned int i;
+
+	secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	for (i = 1; i < hdr->e_shnum; i++)
+		if (strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+			return &sechdrs[i];
+	return NULL;
+}
+
+int module_finalize(const Elf_Ehdr *hdr,
+		const Elf_Shdr *sechdrs, struct module *me)
+{
+	const Elf_Shdr *sect;
+	int rc;
+
+	rc = module_finalize_ftrace(me, sechdrs);
+	if (rc)
+		return rc;
+
+	/* Apply feature fixups */
+	sect = find_section(hdr, sechdrs, "__ftr_fixup");
+	if (sect != NULL)
+		do_feature_fixups(cur_cpu_spec->cpu_features,
+				  (void *)sect->sh_addr,
+				  (void *)sect->sh_addr + sect->sh_size);
+
+	sect = find_section(hdr, sechdrs, "__mmu_ftr_fixup");
+	if (sect != NULL)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  (void *)sect->sh_addr,
+				  (void *)sect->sh_addr + sect->sh_size);
+
+#ifdef CONFIG_PPC64
+	sect = find_section(hdr, sechdrs, "__fw_ftr_fixup");
+	if (sect != NULL)
+		do_feature_fixups(powerpc_firmware_features,
+				  (void *)sect->sh_addr,
+				  (void *)sect->sh_addr + sect->sh_size);
+#endif /* CONFIG_PPC64 */
+
+#ifdef PPC64_ELF_ABI_v1
+	sect = find_section(hdr, sechdrs, ".opd");
+	if (sect != NULL) {
+		me->arch.start_opd = sect->sh_addr;
+		me->arch.end_opd = sect->sh_addr + sect->sh_size;
+	}
+#endif /* PPC64_ELF_ABI_v1 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
+	sect = find_section(hdr, sechdrs, "__spec_barrier_fixup");
+	if (sect != NULL)
+		do_barrier_nospec_fixups_range(barrier_nospec_enabled,
+				  (void *)sect->sh_addr,
+				  (void *)sect->sh_addr + sect->sh_size);
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
+
+	sect = find_section(hdr, sechdrs, "__lwsync_fixup");
+	if (sect != NULL)
+		do_lwsync_fixups(cur_cpu_spec->cpu_features,
+				 (void *)sect->sh_addr,
+				 (void *)sect->sh_addr + sect->sh_size);
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/module.lds b/arch/powerpc/kernel/module.lds
new file mode 100644
index 000000000..cea5dc124
--- /dev/null
+++ b/arch/powerpc/kernel/module.lds
@@ -0,0 +1,8 @@
+/* Force alignment of .toc section.  */
+SECTIONS
+{
+	.toc 0 : ALIGN(256)
+	{
+		*(.got .toc)
+	}
+}
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
new file mode 100644
index 000000000..88d83771f
--- /dev/null
+++ b/arch/powerpc/kernel/module_32.c
@@ -0,0 +1,312 @@
+/*  Kernel module help for PPC.
+    Copyright (C) 2001 Rusty Russell.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/sort.h>
+#include <asm/setup.h>
+
+/* Count how many different relocations (different symbol, different
+   addend) */
+static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
+{
+	unsigned int i, r_info, r_addend, _count_relocs;
+
+	_count_relocs = 0;
+	r_info = 0;
+	r_addend = 0;
+	for (i = 0; i < num; i++)
+		/* Only count 24-bit relocs, others don't need stubs */
+		if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
+		    (r_info != ELF32_R_SYM(rela[i].r_info) ||
+		     r_addend != rela[i].r_addend)) {
+			_count_relocs++;
+			r_info = ELF32_R_SYM(rela[i].r_info);
+			r_addend = rela[i].r_addend;
+		}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	_count_relocs++;	/* add one for ftrace_caller */
+#endif
+	return _count_relocs;
+}
+
+static int relacmp(const void *_x, const void *_y)
+{
+	const Elf32_Rela *x, *y;
+
+	y = (Elf32_Rela *)_x;
+	x = (Elf32_Rela *)_y;
+
+	/* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to
+	 * make the comparison cheaper/faster. It won't affect the sorting or
+	 * the counting algorithms' performance
+	 */
+	if (x->r_info < y->r_info)
+		return -1;
+	else if (x->r_info > y->r_info)
+		return 1;
+	else if (x->r_addend < y->r_addend)
+		return -1;
+	else if (x->r_addend > y->r_addend)
+		return 1;
+	else
+		return 0;
+}
+
+static void relaswap(void *_x, void *_y, int size)
+{
+	uint32_t *x, *y, tmp;
+	int i;
+
+	y = (uint32_t *)_x;
+	x = (uint32_t *)_y;
+
+	for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) {
+		tmp = x[i];
+		x[i] = y[i];
+		y[i] = tmp;
+	}
+}
+
+/* Get the potential trampolines size required of the init and
+   non-init sections */
+static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
+				  const Elf32_Shdr *sechdrs,
+				  const char *secstrings,
+				  int is_init)
+{
+	unsigned long ret = 0;
+	unsigned i;
+
+	/* Everything marked ALLOC (this includes the exported
+           symbols) */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		/* If it's called *.init*, and we're not init, we're
+                   not interested */
+		if ((strstr(secstrings + sechdrs[i].sh_name, ".init") != NULL)
+		    != is_init)
+			continue;
+
+		/* We don't want to look at debug sections. */
+		if (strstr(secstrings + sechdrs[i].sh_name, ".debug"))
+			continue;
+
+		if (sechdrs[i].sh_type == SHT_RELA) {
+			pr_debug("Found relocations in section %u\n", i);
+			pr_debug("Ptr: %p.  Number: %u\n",
+			       (void *)hdr + sechdrs[i].sh_offset,
+			       sechdrs[i].sh_size / sizeof(Elf32_Rela));
+
+			/* Sort the relocation information based on a symbol and
+			 * addend key. This is a stable O(n*log n) complexity
+			 * alogrithm but it will reduce the complexity of
+			 * count_relocs() to linear complexity O(n)
+			 */
+			sort((void *)hdr + sechdrs[i].sh_offset,
+			     sechdrs[i].sh_size / sizeof(Elf32_Rela),
+			     sizeof(Elf32_Rela), relacmp, relaswap);
+
+			ret += count_relocs((void *)hdr
+					     + sechdrs[i].sh_offset,
+					     sechdrs[i].sh_size
+					     / sizeof(Elf32_Rela))
+				* sizeof(struct ppc_plt_entry);
+		}
+	}
+
+	return ret;
+}
+
+int module_frob_arch_sections(Elf32_Ehdr *hdr,
+			      Elf32_Shdr *sechdrs,
+			      char *secstrings,
+			      struct module *me)
+{
+	unsigned int i;
+
+	/* Find .plt and .init.plt sections */
+	for (i = 0; i < hdr->e_shnum; i++) {
+		if (strcmp(secstrings + sechdrs[i].sh_name, ".init.plt") == 0)
+			me->arch.init_plt_section = i;
+		else if (strcmp(secstrings + sechdrs[i].sh_name, ".plt") == 0)
+			me->arch.core_plt_section = i;
+	}
+	if (!me->arch.core_plt_section || !me->arch.init_plt_section) {
+		pr_err("Module doesn't contain .plt or .init.plt sections.\n");
+		return -ENOEXEC;
+	}
+
+	/* Override their sizes */
+	sechdrs[me->arch.core_plt_section].sh_size
+		= get_plt_size(hdr, sechdrs, secstrings, 0);
+	sechdrs[me->arch.init_plt_section].sh_size
+		= get_plt_size(hdr, sechdrs, secstrings, 1);
+	return 0;
+}
+
+static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val)
+{
+	if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16)
+	    && entry->jump[1] == 0x398c0000 + (val & 0xffff))
+		return 1;
+	return 0;
+}
+
+/* Set up a trampoline in the PLT to bounce us to the distant function */
+static uint32_t do_plt_call(void *location,
+			    Elf32_Addr val,
+			    const Elf32_Shdr *sechdrs,
+			    struct module *mod)
+{
+	struct ppc_plt_entry *entry;
+
+	pr_debug("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location);
+	/* Init, or core PLT? */
+	if (location >= mod->core_layout.base
+	    && location < mod->core_layout.base + mod->core_layout.size)
+		entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr;
+	else
+		entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr;
+
+	/* Find this entry, or if that fails, the next avail. entry */
+	while (entry->jump[0]) {
+		if (entry_matches(entry, val)) return (uint32_t)entry;
+		entry++;
+	}
+
+	entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */
+	entry->jump[1] = 0x398c0000 + (val&0xffff);     /* addi r12,r12,sym@l*/
+	entry->jump[2] = 0x7d8903a6;                    /* mtctr r12 */
+	entry->jump[3] = 0x4e800420;			/* bctr */
+
+	pr_debug("Initialized plt for 0x%x at %p\n", val, entry);
+	return (uint32_t)entry;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+		       const char *strtab,
+		       unsigned int symindex,
+		       unsigned int relsec,
+		       struct module *module)
+{
+	unsigned int i;
+	Elf32_Rela *rela = (void *)sechdrs[relsec].sh_addr;
+	Elf32_Sym *sym;
+	uint32_t *location;
+	uint32_t value;
+
+	pr_debug("Applying ADD relocate section %u to %u\n", relsec,
+	       sechdrs[relsec].sh_info);
+	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
+		/* This is where to make the change */
+		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+			+ rela[i].r_offset;
+		/* This is the symbol it is referring to.  Note that all
+		   undefined symbols have been resolved.  */
+		sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+			+ ELF32_R_SYM(rela[i].r_info);
+		/* `Everything is relative'. */
+		value = sym->st_value + rela[i].r_addend;
+
+		switch (ELF32_R_TYPE(rela[i].r_info)) {
+		case R_PPC_ADDR32:
+			/* Simply set it */
+			*(uint32_t *)location = value;
+			break;
+
+		case R_PPC_ADDR16_LO:
+			/* Low half of the symbol */
+			*(uint16_t *)location = value;
+			break;
+
+		case R_PPC_ADDR16_HI:
+			/* Higher half of the symbol */
+			*(uint16_t *)location = (value >> 16);
+			break;
+
+		case R_PPC_ADDR16_HA:
+			/* Sign-adjusted lower 16 bits: PPC ELF ABI says:
+			   (((x >> 16) + ((x & 0x8000) ? 1 : 0))) & 0xFFFF.
+			   This is the same, only sane.
+			 */
+			*(uint16_t *)location = (value + 0x8000) >> 16;
+			break;
+
+		case R_PPC_REL24:
+			if ((int)(value - (uint32_t)location) < -0x02000000
+			    || (int)(value - (uint32_t)location) >= 0x02000000)
+				value = do_plt_call(location, value,
+						    sechdrs, module);
+
+			/* Only replace bits 2 through 26 */
+			pr_debug("REL24 value = %08X. location = %08X\n",
+			       value, (uint32_t)location);
+			pr_debug("Location before: %08X.\n",
+			       *(uint32_t *)location);
+			*(uint32_t *)location
+				= (*(uint32_t *)location & ~0x03fffffc)
+				| ((value - (uint32_t)location)
+				   & 0x03fffffc);
+			pr_debug("Location after: %08X.\n",
+			       *(uint32_t *)location);
+			pr_debug("ie. jump to %08X+%08X = %08X\n",
+			       *(uint32_t *)location & 0x03fffffc,
+			       (uint32_t)location,
+			       (*(uint32_t *)location & 0x03fffffc)
+			       + (uint32_t)location);
+			break;
+
+		case R_PPC_REL32:
+			/* 32-bit relative jump. */
+			*(uint32_t *)location = value - (uint32_t)location;
+			break;
+
+		default:
+			pr_err("%s: unknown ADD relocation: %u\n",
+			       module->name,
+			       ELF32_R_TYPE(rela[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs)
+{
+	module->arch.tramp = do_plt_call(module->core_layout.base,
+					 (unsigned long)ftrace_caller,
+					 sechdrs, module);
+	if (!module->arch.tramp)
+		return -ENOENT;
+
+	return 0;
+}
+#endif
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
new file mode 100644
index 000000000..4bf81a111
--- /dev/null
+++ b/arch/powerpc/kernel/module_64.c
@@ -0,0 +1,842 @@
+/*  Kernel module help for PPC64.
+    Copyright (C) 2001, 2003 Rusty Russell IBM Corporation.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/ftrace.h>
+#include <linux/bug.h>
+#include <linux/uaccess.h>
+#include <asm/module.h>
+#include <asm/firmware.h>
+#include <asm/code-patching.h>
+#include <linux/sort.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+
+/* FIXME: We don't do .init separately.  To do this, we'd need to have
+   a separate r2 value in the init and core section, and stub between
+   them, too.
+
+   Using a magic allocator which places modules within 32MB solves
+   this, and makes other things simpler.  Anton?
+   --RR.  */
+
+#ifdef PPC64_ELF_ABI_v2
+
+/* An address is simply the address of the function. */
+typedef unsigned long func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+	return addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+	return addr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+	return func;
+}
+
+/* PowerPC64 specific values for the Elf64_Sym st_other field.  */
+#define STO_PPC64_LOCAL_BIT	5
+#define STO_PPC64_LOCAL_MASK	(7 << STO_PPC64_LOCAL_BIT)
+#define PPC64_LOCAL_ENTRY_OFFSET(other)					\
+ (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2)
+
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+	/* sym->st_other indicates offset to local entry point
+	 * (otherwise it will assume r12 is the address of the start
+	 * of function and try to derive r2 from it). */
+	return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+}
+#else
+
+/* An address is address of the OPD entry, which contains address of fn. */
+typedef struct ppc64_opd_entry func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+	return *(struct ppc64_opd_entry *)addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+	return func_desc(addr).funcaddr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+	return func.funcaddr;
+}
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+	return 0;
+}
+
+void *dereference_module_function_descriptor(struct module *mod, void *ptr)
+{
+	if (ptr < (void *)mod->arch.start_opd ||
+			ptr >= (void *)mod->arch.end_opd)
+		return ptr;
+
+	return dereference_function_descriptor(ptr);
+}
+#endif
+
+#define STUB_MAGIC 0x73747562 /* stub */
+
+/* Like PPC32, we need little trampolines to do > 24-bit jumps (into
+   the kernel itself).  But on PPC64, these need to be used for every
+   jump, actually, to reset r2 (TOC+0x8000). */
+struct ppc64_stub_entry
+{
+	/* 28 byte jump instruction sequence (7 instructions). We only
+	 * need 6 instructions on ABIv2 but we always allocate 7 so
+	 * so we don't have to modify the trampoline load instruction. */
+	u32 jump[7];
+	/* Used by ftrace to identify stubs */
+	u32 magic;
+	/* Data for the above code */
+	func_desc_t funcdata;
+};
+
+/*
+ * PPC64 uses 24 bit jumps, but we need to jump into other modules or
+ * the kernel which may be further.  So we jump to a stub.
+ *
+ * For ELFv1 we need to use this to set up the new r2 value (aka TOC
+ * pointer).  For ELFv2 it's the callee's responsibility to set up the
+ * new r2, but for both we need to save the old r2.
+ *
+ * We could simply patch the new r2 value and function pointer into
+ * the stub, but it's significantly shorter to put these values at the
+ * end of the stub code, and patch the stub address (32-bits relative
+ * to the TOC ptr, r2) into the stub.
+ */
+
+static u32 ppc64_stub_insns[] = {
+	0x3d620000,			/* addis   r11,r2, <high> */
+	0x396b0000,			/* addi    r11,r11, <low> */
+	/* Save current r2 value in magic place on the stack. */
+	0xf8410000|R2_STACK_OFFSET,	/* std     r2,R2_STACK_OFFSET(r1) */
+	0xe98b0020,			/* ld      r12,32(r11) */
+#ifdef PPC64_ELF_ABI_v1
+	/* Set up new r2 from function descriptor */
+	0xe84b0028,			/* ld      r2,40(r11) */
+#endif
+	0x7d8903a6,			/* mtctr   r12 */
+	0x4e800420			/* bctr */
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+int module_trampoline_target(struct module *mod, unsigned long addr,
+			     unsigned long *target)
+{
+	struct ppc64_stub_entry *stub;
+	func_desc_t funcdata;
+	u32 magic;
+
+	if (!within_module_core(addr, mod)) {
+		pr_err("%s: stub %lx not in module %s\n", __func__, addr, mod->name);
+		return -EFAULT;
+	}
+
+	stub = (struct ppc64_stub_entry *)addr;
+
+	if (probe_kernel_read(&magic, &stub->magic, sizeof(magic))) {
+		pr_err("%s: fault reading magic for stub %lx for %s\n", __func__, addr, mod->name);
+		return -EFAULT;
+	}
+
+	if (magic != STUB_MAGIC) {
+		pr_err("%s: bad magic for stub %lx for %s\n", __func__, addr, mod->name);
+		return -EFAULT;
+	}
+
+	if (probe_kernel_read(&funcdata, &stub->funcdata, sizeof(funcdata))) {
+		pr_err("%s: fault reading funcdata for stub %lx for %s\n", __func__, addr, mod->name);
+                return -EFAULT;
+	}
+
+	*target = stub_func_addr(funcdata);
+
+	return 0;
+}
+#endif
+
+/* Count how many different 24-bit relocations (different symbol,
+   different addend) */
+static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
+{
+	unsigned int i, r_info, r_addend, _count_relocs;
+
+	/* FIXME: Only count external ones --RR */
+	_count_relocs = 0;
+	r_info = 0;
+	r_addend = 0;
+	for (i = 0; i < num; i++)
+		/* Only count 24-bit relocs, others don't need stubs */
+		if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
+		    (r_info != ELF64_R_SYM(rela[i].r_info) ||
+		     r_addend != rela[i].r_addend)) {
+			_count_relocs++;
+			r_info = ELF64_R_SYM(rela[i].r_info);
+			r_addend = rela[i].r_addend;
+		}
+
+	return _count_relocs;
+}
+
+static int relacmp(const void *_x, const void *_y)
+{
+	const Elf64_Rela *x, *y;
+
+	y = (Elf64_Rela *)_x;
+	x = (Elf64_Rela *)_y;
+
+	/* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to
+	 * make the comparison cheaper/faster. It won't affect the sorting or
+	 * the counting algorithms' performance
+	 */
+	if (x->r_info < y->r_info)
+		return -1;
+	else if (x->r_info > y->r_info)
+		return 1;
+	else if (x->r_addend < y->r_addend)
+		return -1;
+	else if (x->r_addend > y->r_addend)
+		return 1;
+	else
+		return 0;
+}
+
+static void relaswap(void *_x, void *_y, int size)
+{
+	uint64_t *x, *y, tmp;
+	int i;
+
+	y = (uint64_t *)_x;
+	x = (uint64_t *)_y;
+
+	for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) {
+		tmp = x[i];
+		x[i] = y[i];
+		y[i] = tmp;
+	}
+}
+
+/* Get size of potential trampolines required. */
+static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
+				    const Elf64_Shdr *sechdrs)
+{
+	/* One extra reloc so it's always 0-funcaddr terminated */
+	unsigned long relocs = 1;
+	unsigned i;
+
+	/* Every relocated section... */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type == SHT_RELA) {
+			pr_debug("Found relocations in section %u\n", i);
+			pr_debug("Ptr: %p.  Number: %Lu\n",
+			       (void *)sechdrs[i].sh_addr,
+			       sechdrs[i].sh_size / sizeof(Elf64_Rela));
+
+			/* Sort the relocation information based on a symbol and
+			 * addend key. This is a stable O(n*log n) complexity
+			 * alogrithm but it will reduce the complexity of
+			 * count_relocs() to linear complexity O(n)
+			 */
+			sort((void *)sechdrs[i].sh_addr,
+			     sechdrs[i].sh_size / sizeof(Elf64_Rela),
+			     sizeof(Elf64_Rela), relacmp, relaswap);
+
+			relocs += count_relocs((void *)sechdrs[i].sh_addr,
+					       sechdrs[i].sh_size
+					       / sizeof(Elf64_Rela));
+		}
+	}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	/* make the trampoline to the ftrace_caller */
+	relocs++;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	/* an additional one for ftrace_regs_caller */
+	relocs++;
+#endif
+#endif
+
+	pr_debug("Looks like a total of %lu stubs, max\n", relocs);
+	return relocs * sizeof(struct ppc64_stub_entry);
+}
+
+/* Still needed for ELFv2, for .TOC. */
+static void dedotify_versions(struct modversion_info *vers,
+			      unsigned long size)
+{
+	struct modversion_info *end;
+
+	for (end = (void *)vers + size; vers < end; vers++)
+		if (vers->name[0] == '.') {
+			memmove(vers->name, vers->name+1, strlen(vers->name));
+		}
+}
+
+/*
+ * Undefined symbols which refer to .funcname, hack to funcname. Make .TOC.
+ * seem to be defined (value set later).
+ */
+static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
+{
+	unsigned int i;
+
+	for (i = 1; i < numsyms; i++) {
+		if (syms[i].st_shndx == SHN_UNDEF) {
+			char *name = strtab + syms[i].st_name;
+			if (name[0] == '.') {
+				if (strcmp(name+1, "TOC.") == 0)
+					syms[i].st_shndx = SHN_ABS;
+				syms[i].st_name++;
+			}
+		}
+	}
+}
+
+static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs,
+			       const char *strtab,
+			       unsigned int symindex)
+{
+	unsigned int i, numsyms;
+	Elf64_Sym *syms;
+
+	syms = (Elf64_Sym *)sechdrs[symindex].sh_addr;
+	numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym);
+
+	for (i = 1; i < numsyms; i++) {
+		if (syms[i].st_shndx == SHN_ABS
+		    && strcmp(strtab + syms[i].st_name, "TOC.") == 0)
+			return &syms[i];
+	}
+	return NULL;
+}
+
+int module_frob_arch_sections(Elf64_Ehdr *hdr,
+			      Elf64_Shdr *sechdrs,
+			      char *secstrings,
+			      struct module *me)
+{
+	unsigned int i;
+
+	/* Find .toc and .stubs sections, symtab and strtab */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		char *p;
+		if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0)
+			me->arch.stubs_section = i;
+		else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) {
+			me->arch.toc_section = i;
+			if (sechdrs[i].sh_addralign < 8)
+				sechdrs[i].sh_addralign = 8;
+		}
+		else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0)
+			dedotify_versions((void *)hdr + sechdrs[i].sh_offset,
+					  sechdrs[i].sh_size);
+
+		/* We don't handle .init for the moment: rename to _init */
+		while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init")))
+			p[0] = '_';
+
+		if (sechdrs[i].sh_type == SHT_SYMTAB)
+			dedotify((void *)hdr + sechdrs[i].sh_offset,
+				 sechdrs[i].sh_size / sizeof(Elf64_Sym),
+				 (void *)hdr
+				 + sechdrs[sechdrs[i].sh_link].sh_offset);
+	}
+
+	if (!me->arch.stubs_section) {
+		pr_err("%s: doesn't contain .stubs.\n", me->name);
+		return -ENOEXEC;
+	}
+
+	/* If we don't have a .toc, just use .stubs.  We need to set r2
+	   to some reasonable value in case the module calls out to
+	   other functions via a stub, or if a function pointer escapes
+	   the module by some means.  */
+	if (!me->arch.toc_section)
+		me->arch.toc_section = me->arch.stubs_section;
+
+	/* Override the stubs size */
+	sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs);
+	return 0;
+}
+
+/*
+ * r2 is the TOC pointer: it actually points 0x8000 into the TOC (this gives the
+ * value maximum span in an instruction which uses a signed offset). Round down
+ * to a 256 byte boundary for the odd case where we are setting up r2 without a
+ * .toc section.
+ */
+static inline unsigned long my_r2(const Elf64_Shdr *sechdrs, struct module *me)
+{
+	return (sechdrs[me->arch.toc_section].sh_addr & ~0xfful) + 0x8000;
+}
+
+/* Both low and high 16 bits are added as SIGNED additions, so if low
+   16 bits has high bit set, high 16 bits must be adjusted.  These
+   macros do that (stolen from binutils). */
+#define PPC_LO(v) ((v) & 0xffff)
+#define PPC_HI(v) (((v) >> 16) & 0xffff)
+#define PPC_HA(v) PPC_HI ((v) + 0x8000)
+
+/* Patch stub to reference function and correct r2 value. */
+static inline int create_stub(const Elf64_Shdr *sechdrs,
+			      struct ppc64_stub_entry *entry,
+			      unsigned long addr,
+			      struct module *me)
+{
+	long reladdr;
+
+	memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
+
+	/* Stub uses address relative to r2. */
+	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
+	if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+		pr_err("%s: Address %p of stub out of range of %p.\n",
+		       me->name, (void *)reladdr, (void *)my_r2);
+		return 0;
+	}
+	pr_debug("Stub %p get data from reladdr %li\n", entry, reladdr);
+
+	entry->jump[0] |= PPC_HA(reladdr);
+	entry->jump[1] |= PPC_LO(reladdr);
+	entry->funcdata = func_desc(addr);
+	entry->magic = STUB_MAGIC;
+
+	return 1;
+}
+
+/* Create stub to jump to function described in this OPD/ptr: we need the
+   stub to set up the TOC ptr (r2) for the function. */
+static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
+				   unsigned long addr,
+				   struct module *me)
+{
+	struct ppc64_stub_entry *stubs;
+	unsigned int i, num_stubs;
+
+	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
+
+	/* Find this stub, or if that fails, the next avail. entry */
+	stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+	for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
+		if (WARN_ON(i >= num_stubs))
+			return 0;
+
+		if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
+			return (unsigned long)&stubs[i];
+	}
+
+	if (!create_stub(sechdrs, &stubs[i], addr, me))
+		return 0;
+
+	return (unsigned long)&stubs[i];
+}
+
+#ifdef CONFIG_MPROFILE_KERNEL
+static bool is_mprofile_mcount_callsite(const char *name, u32 *instruction)
+{
+	if (strcmp("_mcount", name))
+		return false;
+
+	/*
+	 * Check if this is one of the -mprofile-kernel sequences.
+	 */
+	if (instruction[-1] == PPC_INST_STD_LR &&
+	    instruction[-2] == PPC_INST_MFLR)
+		return true;
+
+	if (instruction[-1] == PPC_INST_MFLR)
+		return true;
+
+	return false;
+}
+
+/*
+ * In case of _mcount calls, do not save the current callee's TOC (in r2) into
+ * the original caller's stack frame. If we did we would clobber the saved TOC
+ * value of the original caller.
+ */
+static void squash_toc_save_inst(const char *name, unsigned long addr)
+{
+	struct ppc64_stub_entry *stub = (struct ppc64_stub_entry *)addr;
+
+	/* Only for calls to _mcount */
+	if (strcmp("_mcount", name) != 0)
+		return;
+
+	stub->jump[2] = PPC_INST_NOP;
+}
+#else
+static void squash_toc_save_inst(const char *name, unsigned long addr) { }
+
+static bool is_mprofile_mcount_callsite(const char *name, u32 *instruction)
+{
+	return false;
+}
+#endif
+
+/* We expect a noop next: if it is, replace it with instruction to
+   restore r2. */
+static int restore_r2(const char *name, u32 *instruction, struct module *me)
+{
+	u32 *prev_insn = instruction - 1;
+
+	if (is_mprofile_mcount_callsite(name, prev_insn))
+		return 1;
+
+	/*
+	 * Make sure the branch isn't a sibling call.  Sibling calls aren't
+	 * "link" branches and they don't return, so they don't need the r2
+	 * restore afterwards.
+	 */
+	if (!instr_is_relative_link_branch(*prev_insn))
+		return 1;
+
+	if (*instruction != PPC_INST_NOP) {
+		pr_err("%s: Expected nop after call, got %08x at %pS\n",
+			me->name, *instruction, instruction);
+		return 0;
+	}
+	/* ld r2,R2_STACK_OFFSET(r1) */
+	*instruction = PPC_INST_LD_TOC;
+	return 1;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+		       const char *strtab,
+		       unsigned int symindex,
+		       unsigned int relsec,
+		       struct module *me)
+{
+	unsigned int i;
+	Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr;
+	Elf64_Sym *sym;
+	unsigned long *location;
+	unsigned long value;
+
+	pr_debug("Applying ADD relocate section %u to %u\n", relsec,
+	       sechdrs[relsec].sh_info);
+
+	/* First time we're called, we can fix up .TOC. */
+	if (!me->arch.toc_fixed) {
+		sym = find_dot_toc(sechdrs, strtab, symindex);
+		/* It's theoretically possible that a module doesn't want a
+		 * .TOC. so don't fail it just for that. */
+		if (sym)
+			sym->st_value = my_r2(sechdrs, me);
+		me->arch.toc_fixed = true;
+	}
+
+	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
+		/* This is where to make the change */
+		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+			+ rela[i].r_offset;
+		/* This is the symbol it is referring to */
+		sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+			+ ELF64_R_SYM(rela[i].r_info);
+
+		pr_debug("RELOC at %p: %li-type as %s (0x%lx) + %li\n",
+		       location, (long)ELF64_R_TYPE(rela[i].r_info),
+		       strtab + sym->st_name, (unsigned long)sym->st_value,
+		       (long)rela[i].r_addend);
+
+		/* `Everything is relative'. */
+		value = sym->st_value + rela[i].r_addend;
+
+		switch (ELF64_R_TYPE(rela[i].r_info)) {
+		case R_PPC64_ADDR32:
+			/* Simply set it */
+			*(u32 *)location = value;
+			break;
+
+		case R_PPC64_ADDR64:
+			/* Simply set it */
+			*(unsigned long *)location = value;
+			break;
+
+		case R_PPC64_TOC:
+			*(unsigned long *)location = my_r2(sechdrs, me);
+			break;
+
+		case R_PPC64_TOC16:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if (value + 0x8000 > 0xffff) {
+				pr_err("%s: bad TOC16 relocation (0x%lx)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC64_TOC16_LO:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC64_TOC16_DS:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if ((value & 3) != 0 || value + 0x8000 > 0xffff) {
+				pr_err("%s: bad TOC16_DS relocation (0x%lx)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xfffc)
+				| (value & 0xfffc);
+			break;
+
+		case R_PPC64_TOC16_LO_DS:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			if ((value & 3) != 0) {
+				pr_err("%s: bad TOC16_LO_DS relocation (0x%lx)\n",
+				       me->name, value);
+				return -ENOEXEC;
+			}
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xfffc)
+				| (value & 0xfffc);
+			break;
+
+		case R_PPC64_TOC16_HA:
+			/* Subtract TOC pointer */
+			value -= my_r2(sechdrs, me);
+			value = ((value + 0x8000) >> 16);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC_REL24:
+			/* FIXME: Handle weak symbols here --RR */
+			if (sym->st_shndx == SHN_UNDEF ||
+			    sym->st_shndx == SHN_LIVEPATCH) {
+				/* External: go via stub */
+				value = stub_for_addr(sechdrs, value, me);
+				if (!value)
+					return -ENOENT;
+				if (!restore_r2(strtab + sym->st_name,
+							(u32 *)location + 1, me))
+					return -ENOEXEC;
+
+				squash_toc_save_inst(strtab + sym->st_name, value);
+			} else
+				value += local_entry_offset(sym);
+
+			/* Convert value to relative */
+			value -= (unsigned long)location;
+			if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){
+				pr_err("%s: REL24 %li out of range!\n",
+				       me->name, (long int)value);
+				return -ENOEXEC;
+			}
+
+			/* Only replace bits 2 through 26 */
+			*(uint32_t *)location
+				= (*(uint32_t *)location & ~0x03fffffc)
+				| (value & 0x03fffffc);
+			break;
+
+		case R_PPC64_REL64:
+			/* 64 bits relative (used by features fixups) */
+			*location = value - (unsigned long)location;
+			break;
+
+		case R_PPC64_REL32:
+			/* 32 bits relative (used by relative exception tables) */
+			/* Convert value to relative */
+			value -= (unsigned long)location;
+			if (value + 0x80000000 > 0xffffffff) {
+				pr_err("%s: REL32 %li out of range!\n",
+				       me->name, (long int)value);
+				return -ENOEXEC;
+			}
+			*(u32 *)location = value;
+			break;
+
+		case R_PPC64_TOCSAVE:
+			/*
+			 * Marker reloc indicates we don't have to save r2.
+			 * That would only save us one instruction, so ignore
+			 * it.
+			 */
+			break;
+
+		case R_PPC64_ENTRY:
+			/*
+			 * Optimize ELFv2 large code model entry point if
+			 * the TOC is within 2GB range of current location.
+			 */
+			value = my_r2(sechdrs, me) - (unsigned long)location;
+			if (value + 0x80008000 > 0xffffffff)
+				break;
+			/*
+			 * Check for the large code model prolog sequence:
+		         *	ld r2, ...(r12)
+			 *	add r2, r2, r12
+			 */
+			if ((((uint32_t *)location)[0] & ~0xfffc)
+			    != 0xe84c0000)
+				break;
+			if (((uint32_t *)location)[1] != 0x7c426214)
+				break;
+			/*
+			 * If found, replace it with:
+			 *	addis r2, r12, (.TOC.-func)@ha
+			 *	addi  r2,  r2, (.TOC.-func)@l
+			 */
+			((uint32_t *)location)[0] = 0x3c4c0000 + PPC_HA(value);
+			((uint32_t *)location)[1] = 0x38420000 + PPC_LO(value);
+			break;
+
+		case R_PPC64_REL16_HA:
+			/* Subtract location pointer */
+			value -= (unsigned long)location;
+			value = ((value + 0x8000) >> 16);
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		case R_PPC64_REL16_LO:
+			/* Subtract location pointer */
+			value -= (unsigned long)location;
+			*((uint16_t *) location)
+				= (*((uint16_t *) location) & ~0xffff)
+				| (value & 0xffff);
+			break;
+
+		default:
+			pr_err("%s: Unknown ADD relocation: %lu\n",
+			       me->name,
+			       (unsigned long)ELF64_R_TYPE(rela[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#ifdef CONFIG_MPROFILE_KERNEL
+
+#define PACATOC offsetof(struct paca_struct, kernel_toc)
+
+/*
+ * For mprofile-kernel we use a special stub for ftrace_caller() because we
+ * can't rely on r2 containing this module's TOC when we enter the stub.
+ *
+ * That can happen if the function calling us didn't need to use the toc. In
+ * that case it won't have setup r2, and the r2 value will be either the
+ * kernel's toc, or possibly another modules toc.
+ *
+ * To deal with that this stub uses the kernel toc, which is always accessible
+ * via the paca (in r13). The target (ftrace_caller()) is responsible for
+ * saving and restoring the toc before returning.
+ */
+static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs,
+				struct module *me, unsigned long addr)
+{
+	struct ppc64_stub_entry *entry;
+	unsigned int i, num_stubs;
+	static u32 stub_insns[] = {
+		0xe98d0000 | PACATOC, 	/* ld      r12,PACATOC(r13)	*/
+		0x3d8c0000,		/* addis   r12,r12,<high>	*/
+		0x398c0000, 		/* addi    r12,r12,<low>	*/
+		0x7d8903a6, 		/* mtctr   r12			*/
+		0x4e800420, 		/* bctr				*/
+	};
+	long reladdr;
+
+	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*entry);
+
+	/* Find the next available stub entry */
+	entry = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+	for (i = 0; i < num_stubs && stub_func_addr(entry->funcdata); i++, entry++);
+
+	if (i >= num_stubs) {
+		pr_err("%s: Unable to find a free slot for ftrace stub.\n", me->name);
+		return 0;
+	}
+
+	memcpy(entry->jump, stub_insns, sizeof(stub_insns));
+
+	/* Stub uses address relative to kernel toc (from the paca) */
+	reladdr = addr - kernel_toc_addr();
+	if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+		pr_err("%s: Address of %ps out of range of kernel_toc.\n",
+							me->name, (void *)addr);
+		return 0;
+	}
+
+	entry->jump[1] |= PPC_HA(reladdr);
+	entry->jump[2] |= PPC_LO(reladdr);
+
+	/* Eventhough we don't use funcdata in the stub, it's needed elsewhere. */
+	entry->funcdata = func_desc(addr);
+	entry->magic = STUB_MAGIC;
+
+	return (unsigned long)entry;
+}
+#else
+static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs,
+				struct module *me, unsigned long addr)
+{
+	return stub_for_addr(sechdrs, addr, me);
+}
+#endif
+
+int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs)
+{
+	mod->arch.tramp = create_ftrace_stub(sechdrs, mod,
+					(unsigned long)ftrace_caller);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	mod->arch.tramp_regs = create_ftrace_stub(sechdrs, mod,
+					(unsigned long)ftrace_regs_caller);
+	if (!mod->arch.tramp_regs)
+		return -ENOENT;
+#endif
+
+	if (!mod->arch.tramp)
+		return -ENOENT;
+
+	return 0;
+}
+#endif
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
new file mode 100644
index 000000000..f2197654b
--- /dev/null
+++ b/arch/powerpc/kernel/msi.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2006-2007, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+
+#include <asm/machdep.h>
+
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (!phb->controller_ops.setup_msi_irqs ||
+	    !phb->controller_ops.teardown_msi_irqs) {
+		pr_debug("msi: Platform doesn't provide MSI callbacks.\n");
+		return -ENOSYS;
+	}
+
+	/* PowerPC doesn't support multiple MSI yet */
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
+
+	return phb->controller_ops.setup_msi_irqs(dev, nvec, type);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	/*
+	 * We can be called even when arch_setup_msi_irqs() returns -ENOSYS,
+	 * so check the pointer again.
+	 */
+	if (phb->controller_ops.teardown_msi_irqs)
+		phb->controller_ops.teardown_msi_irqs(dev);
+}
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
new file mode 100644
index 000000000..e7d4ce696
--- /dev/null
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -0,0 +1,1212 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ *
+ * TODO: Split the /dev/nvram part (that one can use
+ *       drivers/char/generic_nvram.c) from the arch & partition
+ *       parsing code.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/fcntl.h>
+#include <linux/nvram.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/kmsg_dump.h>
+#include <linux/pagemap.h>
+#include <linux/pstore.h>
+#include <linux/zlib.h>
+#include <linux/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+#undef DEBUG_NVRAM
+
+#define NVRAM_HEADER_LEN	sizeof(struct nvram_header)
+#define NVRAM_BLOCK_LEN		NVRAM_HEADER_LEN
+
+/* If change this size, then change the size of NVNAME_LEN */
+struct nvram_header {
+	unsigned char signature;
+	unsigned char checksum;
+	unsigned short length;
+	/* Terminating null required only for names < 12 chars. */
+	char name[12];
+};
+
+struct nvram_partition {
+	struct list_head partition;
+	struct nvram_header header;
+	unsigned int index;
+};
+
+static LIST_HEAD(nvram_partitions);
+
+#ifdef CONFIG_PPC_PSERIES
+struct nvram_os_partition rtas_log_partition = {
+	.name = "ibm,rtas-log",
+	.req_size = 2079,
+	.min_size = 1055,
+	.index = -1,
+	.os_partition = true
+};
+#endif
+
+struct nvram_os_partition oops_log_partition = {
+	.name = "lnx,oops-log",
+	.req_size = 4000,
+	.min_size = 2000,
+	.index = -1,
+	.os_partition = true
+};
+
+static const char *nvram_os_partitions[] = {
+#ifdef CONFIG_PPC_PSERIES
+	"ibm,rtas-log",
+#endif
+	"lnx,oops-log",
+	NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+			  enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+	.dump = oops_to_nvram
+};
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a oops header.
+ * oops header has u16 holding the version of oops header (to differentiate
+ * between old and new format header) followed by u16 holding the length of
+ * the compressed* text (*Or uncompressed, if compression fails.) and u64
+ * holding the timestamp. oops_buf[] gets written to NVRAM.
+ *
+ * oops_log_info points to the header. oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * |                                   +- oops_data
+ * v                                   v
+ * +-----------+-----------+-----------+------------------------+
+ * | version   | length    | timestamp | text                   |
+ * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes)   |
+ * +-----------+-----------+-----------+------------------------+
+ * ^
+ * +- oops_log_info
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+#ifdef CONFIG_PSTORE
+#ifdef CONFIG_PPC_POWERNV
+static struct nvram_os_partition skiboot_partition = {
+	.name = "ibm,skiboot",
+	.index = -1,
+	.os_partition = false
+};
+#endif
+
+#ifdef CONFIG_PPC_PSERIES
+static struct nvram_os_partition of_config_partition = {
+	.name = "of-config",
+	.index = -1,
+	.os_partition = false
+};
+#endif
+
+static struct nvram_os_partition common_partition = {
+	.name = "common",
+	.index = -1,
+	.os_partition = false
+};
+
+static enum pstore_type_id nvram_type_ids[] = {
+	PSTORE_TYPE_DMESG,
+	PSTORE_TYPE_PPC_COMMON,
+	-1,
+	-1,
+	-1
+};
+static int read_type;
+#endif
+
+/* nvram_write_os_partition
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode.  If we have a severe error there
+ * is no way to guarantee that the OS or the machine is in a state to
+ * get back to user land and write the error to disk.  For example if
+ * the SCSI device driver causes a Machine Check by writing to a bad
+ * IO address, there is no way of guaranteeing that the device driver
+ * is in any state that is would also be able to write the error data
+ * captured to disk, thus we buffer it in NVRAM for analysis on the
+ * next boot.
+ *
+ * In NVRAM the partition containing the error log buffer will looks like:
+ * Header (in bytes):
+ * +-----------+----------+--------+------------+------------------+
+ * | signature | checksum | length | name       | data             |
+ * |0          |1         |2      3|4         15|16        length-1|
+ * +-----------+----------+--------+------------+------------------+
+ *
+ * The 'data' section would look like (in bytes):
+ * +--------------+------------+-----------------------------------+
+ * | event_logged | sequence # | error log                         |
+ * |0            3|4          7|8                  error_log_size-1|
+ * +--------------+------------+-----------------------------------+
+ *
+ * event_logged: 0 if event has not been logged to syslog, 1 if it has
+ * sequence #: The unique sequence # for each event. (until it wraps)
+ * error log: The error log from event_scan
+ */
+int nvram_write_os_partition(struct nvram_os_partition *part,
+			     char *buff, int length,
+			     unsigned int err_type,
+			     unsigned int error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+
+	if (part->index == -1)
+		return -ESPIPE;
+
+	if (length > part->size)
+		length = part->size;
+
+	info.error_type = cpu_to_be32(err_type);
+	info.seq_num = cpu_to_be32(error_log_cnt);
+
+	tmp_index = part->index;
+
+	rc = ppc_md.nvram_write((char *)&info, sizeof(info), &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
+		return rc;
+	}
+
+	rc = ppc_md.nvram_write(buff, length, &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_write (%d)\n", __func__, rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* nvram_read_partition
+ *
+ * Reads nvram partition for at most 'length'
+ */
+int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+			 int length, unsigned int *err_type,
+			 unsigned int *error_log_cnt)
+{
+	int rc;
+	loff_t tmp_index;
+	struct err_log_info info;
+
+	if (part->index == -1)
+		return -1;
+
+	if (length > part->size)
+		length = part->size;
+
+	tmp_index = part->index;
+
+	if (part->os_partition) {
+		rc = ppc_md.nvram_read((char *)&info, sizeof(info), &tmp_index);
+		if (rc <= 0) {
+			pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
+			return rc;
+		}
+	}
+
+	rc = ppc_md.nvram_read(buff, length, &tmp_index);
+	if (rc <= 0) {
+		pr_err("%s: Failed nvram_read (%d)\n", __func__, rc);
+		return rc;
+	}
+
+	if (part->os_partition) {
+		*error_log_cnt = be32_to_cpu(info.seq_num);
+		*err_type = be32_to_cpu(info.error_type);
+	}
+
+	return 0;
+}
+
+/* nvram_init_os_partition
+ *
+ * This sets up a partition with an "OS" signature.
+ *
+ * The general strategy is the following:
+ * 1.) If a partition with the indicated name already exists...
+ *	- If it's large enough, use it.
+ *	- Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minum needed.
+ *
+ * Returns 0 on success, else -1.
+ */
+int __init nvram_init_os_partition(struct nvram_os_partition *part)
+{
+	loff_t p;
+	int size;
+
+	/* Look for ours */
+	p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
+
+	/* Found one but too small, remove it */
+	if (p && size < part->min_size) {
+		pr_info("nvram: Found too small %s partition,"
+					" removing it...\n", part->name);
+		nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
+		p = 0;
+	}
+
+	/* Create one if we didn't find */
+	if (!p) {
+		p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		if (p == -ENOSPC) {
+			pr_info("nvram: No room to create %s partition, "
+				"deleting any obsolete OS partitions...\n",
+				part->name);
+			nvram_remove_partition(NULL, NVRAM_SIG_OS,
+					nvram_os_partitions);
+			p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+					part->req_size, part->min_size);
+		}
+	}
+
+	if (p <= 0) {
+		pr_err("nvram: Failed to find or create %s"
+		       " partition, err %d\n", part->name, (int)p);
+		return -1;
+	}
+
+	part->index = p;
+	part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
+
+	return 0;
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+								oops_data_sz);
+	if (zipped_len < 0) {
+		pr_err("nvram: compression failed; returned %d\n", zipped_len);
+		pr_err("nvram: logging uncompressed oops/panic report\n");
+		return -1;
+	}
+	oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+	oops_hdr->report_length = cpu_to_be16(zipped_len);
+	oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
+	return 0;
+}
+
+#ifdef CONFIG_PSTORE
+static int nvram_pstore_open(struct pstore_info *psi)
+{
+	/* Reset the iterator to start reading partitions again */
+	read_type = -1;
+	return 0;
+}
+
+/**
+ * nvram_pstore_write - pstore write callback for nvram
+ * @record:             pstore record to write, with @id to be set
+ *
+ * Called by pstore_dump() when an oops or panic report is logged in the
+ * printk buffer.
+ * Returns 0 on successful write.
+ */
+static int nvram_pstore_write(struct pstore_record *record)
+{
+	int rc;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
+	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
+
+	/* part 1 has the recent messages from printk buffer */
+	if (record->part > 1 || (record->type != PSTORE_TYPE_DMESG))
+		return -1;
+
+	if (clobbering_unread_rtas_event())
+		return -1;
+
+	oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+	oops_hdr->report_length = cpu_to_be16(record->size);
+	oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
+
+	if (record->compressed)
+		err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+
+	rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_hdr) + record->size), err_type,
+		record->count);
+
+	if (rc != 0)
+		return rc;
+
+	record->id = record->part;
+	return 0;
+}
+
+/*
+ * Reads the oops/panic report, rtas, of-config and common partition.
+ * Returns the length of the data we read from each partition.
+ * Returns 0 if we've been called before.
+ */
+static ssize_t nvram_pstore_read(struct pstore_record *record)
+{
+	struct oops_log_info *oops_hdr;
+	unsigned int err_type, id_no, size = 0;
+	struct nvram_os_partition *part = NULL;
+	char *buff = NULL;
+	int sig = 0;
+	loff_t p;
+
+	read_type++;
+
+	switch (nvram_type_ids[read_type]) {
+	case PSTORE_TYPE_DMESG:
+		part = &oops_log_partition;
+		record->type = PSTORE_TYPE_DMESG;
+		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		sig = NVRAM_SIG_SYS;
+		part = &common_partition;
+		record->type = PSTORE_TYPE_PPC_COMMON;
+		record->id = PSTORE_TYPE_PPC_COMMON;
+		record->time.tv_sec = 0;
+		record->time.tv_nsec = 0;
+		break;
+#ifdef CONFIG_PPC_PSERIES
+	case PSTORE_TYPE_PPC_RTAS:
+		part = &rtas_log_partition;
+		record->type = PSTORE_TYPE_PPC_RTAS;
+		record->time.tv_sec = last_rtas_event;
+		record->time.tv_nsec = 0;
+		break;
+	case PSTORE_TYPE_PPC_OF:
+		sig = NVRAM_SIG_OF;
+		part = &of_config_partition;
+		record->type = PSTORE_TYPE_PPC_OF;
+		record->id = PSTORE_TYPE_PPC_OF;
+		record->time.tv_sec = 0;
+		record->time.tv_nsec = 0;
+		break;
+#endif
+#ifdef CONFIG_PPC_POWERNV
+	case PSTORE_TYPE_PPC_OPAL:
+		sig = NVRAM_SIG_FW;
+		part = &skiboot_partition;
+		record->type = PSTORE_TYPE_PPC_OPAL;
+		record->id = PSTORE_TYPE_PPC_OPAL;
+		record->time.tv_sec = 0;
+		record->time.tv_nsec = 0;
+		break;
+#endif
+	default:
+		return 0;
+	}
+
+	if (!part->os_partition) {
+		p = nvram_find_partition(part->name, sig, &size);
+		if (p <= 0) {
+			pr_err("nvram: Failed to find partition %s, "
+				"err %d\n", part->name, (int)p);
+			return 0;
+		}
+		part->index = p;
+		part->size = size;
+	}
+
+	buff = kmalloc(part->size, GFP_KERNEL);
+
+	if (!buff)
+		return -ENOMEM;
+
+	if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
+		kfree(buff);
+		return 0;
+	}
+
+	record->count = 0;
+
+	if (part->os_partition)
+		record->id = id_no;
+
+	if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
+		size_t length, hdr_size;
+
+		oops_hdr = (struct oops_log_info *)buff;
+		if (be16_to_cpu(oops_hdr->version) < OOPS_HDR_VERSION) {
+			/* Old format oops header had 2-byte record size */
+			hdr_size = sizeof(u16);
+			length = be16_to_cpu(oops_hdr->version);
+			record->time.tv_sec = 0;
+			record->time.tv_nsec = 0;
+		} else {
+			hdr_size = sizeof(*oops_hdr);
+			length = be16_to_cpu(oops_hdr->report_length);
+			record->time.tv_sec = be64_to_cpu(oops_hdr->timestamp);
+			record->time.tv_nsec = 0;
+		}
+		record->buf = kmemdup(buff + hdr_size, length, GFP_KERNEL);
+		kfree(buff);
+		if (record->buf == NULL)
+			return -ENOMEM;
+
+		record->ecc_notice_size = 0;
+		if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
+			record->compressed = true;
+		else
+			record->compressed = false;
+		return length;
+	}
+
+	record->buf = buff;
+	return part->size;
+}
+
+static struct pstore_info nvram_pstore_info = {
+	.owner = THIS_MODULE,
+	.name = "nvram",
+	.flags = PSTORE_FLAGS_DMESG,
+	.open = nvram_pstore_open,
+	.read = nvram_pstore_read,
+	.write = nvram_pstore_write,
+};
+
+static int nvram_pstore_init(void)
+{
+	int rc = 0;
+
+	if (machine_is(pseries)) {
+		nvram_type_ids[2] = PSTORE_TYPE_PPC_RTAS;
+		nvram_type_ids[3] = PSTORE_TYPE_PPC_OF;
+	} else
+		nvram_type_ids[2] = PSTORE_TYPE_PPC_OPAL;
+
+	nvram_pstore_info.buf = oops_data;
+	nvram_pstore_info.bufsize = oops_data_sz;
+
+	rc = pstore_register(&nvram_pstore_info);
+	if (rc && (rc != -EPERM))
+		/* Print error only when pstore.backend == nvram */
+		pr_err("nvram: pstore_register() failed, returned %d. "
+				"Defaults to kmsg_dump\n", rc);
+
+	return rc;
+}
+#else
+static int nvram_pstore_init(void)
+{
+	return -1;
+}
+#endif
+
+void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+	int rc;
+
+	rc = nvram_init_os_partition(&oops_log_partition);
+	if (rc != 0) {
+#ifdef CONFIG_PPC_PSERIES
+		if (!rtas_partition_exists) {
+			pr_err("nvram: Failed to initialize oops partition!");
+			return;
+		}
+		pr_notice("nvram: Using %s partition to log both"
+			" RTAS errors and oops/panic reports\n",
+			rtas_log_partition.name);
+		memcpy(&oops_log_partition, &rtas_log_partition,
+						sizeof(rtas_log_partition));
+#else
+		pr_err("nvram: Failed to initialize oops partition!");
+		return;
+#endif
+	}
+	oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+	if (!oops_buf) {
+		pr_err("nvram: No memory for %s partition\n",
+						oops_log_partition.name);
+		return;
+	}
+	oops_data = oops_buf + sizeof(struct oops_log_info);
+	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
+
+	rc = nvram_pstore_init();
+
+	if (!rc)
+		return;
+
+	/*
+	 * Figure compression (preceded by elimination of each line's <n>
+	 * severity prefix) will reduce the oops/panic report to at most
+	 * 45% of its original size.
+	 */
+	big_oops_buf_sz = (oops_data_sz * 100) / 45;
+	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+	if (big_oops_buf) {
+		stream.workspace =  kmalloc(zlib_deflate_workspacesize(
+					WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+		if (!stream.workspace) {
+			pr_err("nvram: No memory for compression workspace; "
+				"skipping compression of %s partition data\n",
+				oops_log_partition.name);
+			kfree(big_oops_buf);
+			big_oops_buf = NULL;
+		}
+	} else {
+		pr_err("No memory for uncompressed %s data; "
+			"skipping compression\n", oops_log_partition.name);
+		stream.workspace = NULL;
+	}
+
+	rc = kmsg_dump_register(&nvram_kmsg_dumper);
+	if (rc != 0) {
+		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+		kfree(oops_buf);
+		kfree(big_oops_buf);
+		kfree(stream.workspace);
+	}
+}
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer.  We want to capture as much
+ * of the printk buffer as possible.  First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition.  If that's too much, go back and capture uncompressed text.
+ */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+			  enum kmsg_dump_reason reason)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	static unsigned int oops_count = 0;
+	static bool panicking = false;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long flags;
+	size_t text_len;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+	int rc = -1;
+
+	switch (reason) {
+	case KMSG_DUMP_RESTART:
+	case KMSG_DUMP_HALT:
+	case KMSG_DUMP_POWEROFF:
+		/* These are almost always orderly shutdowns. */
+		return;
+	case KMSG_DUMP_OOPS:
+		break;
+	case KMSG_DUMP_PANIC:
+		panicking = true;
+		break;
+	case KMSG_DUMP_EMERG:
+		if (panicking)
+			/* Panic report already captured. */
+			return;
+		break;
+	default:
+		pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
+		       __func__, (int) reason);
+		return;
+	}
+
+	if (clobbering_unread_rtas_event())
+		return;
+
+	if (!spin_trylock_irqsave(&lock, flags))
+		return;
+
+	if (big_oops_buf) {
+		kmsg_dump_get_buffer(dumper, false,
+				     big_oops_buf, big_oops_buf_sz, &text_len);
+		rc = zip_oops(text_len);
+	}
+	if (rc != 0) {
+		kmsg_dump_rewind(dumper);
+		kmsg_dump_get_buffer(dumper, false,
+				     oops_data, oops_data_sz, &text_len);
+		err_type = ERR_TYPE_KERNEL_PANIC;
+		oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
+		oops_hdr->report_length = cpu_to_be16(text_len);
+		oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds());
+	}
+
+	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_hdr) + text_len), err_type,
+		++oops_count);
+
+	spin_unlock_irqrestore(&lock, flags);
+}
+
+static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin)
+{
+	if (ppc_md.nvram_size == NULL)
+		return -ENODEV;
+	return generic_file_llseek_size(file, offset, origin, MAX_LFS_FILESIZE,
+					ppc_md.nvram_size());
+}
+
+
+static ssize_t dev_nvram_read(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	ssize_t ret;
+	char *tmp = NULL;
+	ssize_t size;
+
+	if (!ppc_md.nvram_size) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	size = ppc_md.nvram_size();
+	if (size < 0) {
+		ret = size;
+		goto out;
+	}
+
+	if (*ppos >= size) {
+		ret = 0;
+		goto out;
+	}
+
+	count = min_t(size_t, count, size - *ppos);
+	count = min(count, PAGE_SIZE);
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ppc_md.nvram_read(tmp, count, ppos);
+	if (ret <= 0)
+		goto out;
+
+	if (copy_to_user(buf, tmp, ret))
+		ret = -EFAULT;
+
+out:
+	kfree(tmp);
+	return ret;
+
+}
+
+static ssize_t dev_nvram_write(struct file *file, const char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	ssize_t ret;
+	char *tmp = NULL;
+	ssize_t size;
+
+	ret = -ENODEV;
+	if (!ppc_md.nvram_size)
+		goto out;
+
+	ret = 0;
+	size = ppc_md.nvram_size();
+	if (*ppos >= size || size < 0)
+		goto out;
+
+	count = min_t(size_t, count, size - *ppos);
+	count = min(count, PAGE_SIZE);
+
+	tmp = memdup_user(buf, count);
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto out;
+	}
+
+	ret = ppc_md.nvram_write(tmp, count, ppos);
+
+	kfree(tmp);
+out:
+	return ret;
+}
+
+static long dev_nvram_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	switch(cmd) {
+#ifdef CONFIG_PPC_PMAC
+	case OBSOLETE_PMAC_NVRAM_GET_OFFSET:
+		printk(KERN_WARNING "nvram: Using obsolete PMAC_NVRAM_GET_OFFSET ioctl\n");
+	case IOC_NVRAM_GET_OFFSET: {
+		int part, offset;
+
+		if (!machine_is(powermac))
+			return -EINVAL;
+		if (copy_from_user(&part, (void __user*)arg, sizeof(part)) != 0)
+			return -EFAULT;
+		if (part < pmac_nvram_OF || part > pmac_nvram_NR)
+			return -EINVAL;
+		offset = pmac_get_partition(part);
+		if (offset < 0)
+			return offset;
+		if (copy_to_user((void __user*)arg, &offset, sizeof(offset)) != 0)
+			return -EFAULT;
+		return 0;
+	}
+#endif /* CONFIG_PPC_PMAC */
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct file_operations nvram_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= dev_nvram_llseek,
+	.read		= dev_nvram_read,
+	.write		= dev_nvram_write,
+	.unlocked_ioctl	= dev_nvram_ioctl,
+};
+
+static struct miscdevice nvram_dev = {
+	NVRAM_MINOR,
+	"nvram",
+	&nvram_fops
+};
+
+
+#ifdef DEBUG_NVRAM
+static void __init nvram_print_partitions(char * label)
+{
+	struct nvram_partition * tmp_part;
+	
+	printk(KERN_WARNING "--------%s---------\n", label);
+	printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n");
+	list_for_each_entry(tmp_part, &nvram_partitions, partition) {
+		printk(KERN_WARNING "%4d    \t%02x\t%02x\t%d\t%12.12s\n",
+		       tmp_part->index, tmp_part->header.signature,
+		       tmp_part->header.checksum, tmp_part->header.length,
+		       tmp_part->header.name);
+	}
+}
+#endif
+
+
+static int __init nvram_write_header(struct nvram_partition * part)
+{
+	loff_t tmp_index;
+	int rc;
+	struct nvram_header phead;
+
+	memcpy(&phead, &part->header, NVRAM_HEADER_LEN);
+	phead.length = cpu_to_be16(phead.length);
+
+	tmp_index = part->index;
+	rc = ppc_md.nvram_write((char *)&phead, NVRAM_HEADER_LEN, &tmp_index);
+
+	return rc;
+}
+
+
+static unsigned char __init nvram_checksum(struct nvram_header *p)
+{
+	unsigned int c_sum, c_sum2;
+	unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */
+	c_sum = p->signature + p->length + sp[0] + sp[1] + sp[2] + sp[3] + sp[4] + sp[5];
+
+	/* The sum may have spilled into the 3rd byte.  Fold it back. */
+	c_sum = ((c_sum & 0xffff) + (c_sum >> 16)) & 0xffff;
+	/* The sum cannot exceed 2 bytes.  Fold it into a checksum */
+	c_sum2 = (c_sum >> 8) + (c_sum << 8);
+	c_sum = ((c_sum + c_sum2) >> 8) & 0xff;
+	return c_sum;
+}
+
+/*
+ * Per the criteria passed via nvram_remove_partition(), should this
+ * partition be removed?  1=remove, 0=keep
+ */
+static int nvram_can_remove_partition(struct nvram_partition *part,
+		const char *name, int sig, const char *exceptions[])
+{
+	if (part->header.signature != sig)
+		return 0;
+	if (name) {
+		if (strncmp(name, part->header.name, 12))
+			return 0;
+	} else if (exceptions) {
+		const char **except;
+		for (except = exceptions; *except; except++) {
+			if (!strncmp(*except, part->header.name, 12))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+/**
+ * nvram_remove_partition - Remove one or more partitions in nvram
+ * @name: name of the partition to remove, or NULL for a
+ *        signature only match
+ * @sig: signature of the partition(s) to remove
+ * @exceptions: When removing all partitions with a matching signature,
+ *        leave these alone.
+ */
+
+int __init nvram_remove_partition(const char *name, int sig,
+						const char *exceptions[])
+{
+	struct nvram_partition *part, *prev, *tmp;
+	int rc;
+
+	list_for_each_entry(part, &nvram_partitions, partition) {
+		if (!nvram_can_remove_partition(part, name, sig, exceptions))
+			continue;
+
+		/* Make partition a free partition */
+		part->header.signature = NVRAM_SIG_FREE;
+		memset(part->header.name, 'w', 12);
+		part->header.checksum = nvram_checksum(&part->header);
+		rc = nvram_write_header(part);
+		if (rc <= 0) {
+			printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
+			return rc;
+		}
+	}
+
+	/* Merge contiguous ones */
+	prev = NULL;
+	list_for_each_entry_safe(part, tmp, &nvram_partitions, partition) {
+		if (part->header.signature != NVRAM_SIG_FREE) {
+			prev = NULL;
+			continue;
+		}
+		if (prev) {
+			prev->header.length += part->header.length;
+			prev->header.checksum = nvram_checksum(&prev->header);
+			rc = nvram_write_header(prev);
+			if (rc <= 0) {
+				printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
+				return rc;
+			}
+			list_del(&part->partition);
+			kfree(part);
+		} else
+			prev = part;
+	}
+	
+	return 0;
+}
+
+/**
+ * nvram_create_partition - Create a partition in nvram
+ * @name: name of the partition to create
+ * @sig: signature of the partition to create
+ * @req_size: size of data to allocate in bytes
+ * @min_size: minimum acceptable size (0 means req_size)
+ *
+ * Returns a negative error code or a positive nvram index
+ * of the beginning of the data area of the newly created
+ * partition. If you provided a min_size smaller than req_size
+ * you need to query for the actual size yourself after the
+ * call using nvram_partition_get_size().
+ */
+loff_t __init nvram_create_partition(const char *name, int sig,
+				     int req_size, int min_size)
+{
+	struct nvram_partition *part;
+	struct nvram_partition *new_part;
+	struct nvram_partition *free_part = NULL;
+	static char nv_init_vals[16];
+	loff_t tmp_index;
+	long size = 0;
+	int rc;
+
+	/* Convert sizes from bytes to blocks */
+	req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+	min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+
+	/* If no minimum size specified, make it the same as the
+	 * requested size
+	 */
+	if (min_size == 0)
+		min_size = req_size;
+	if (min_size > req_size)
+		return -EINVAL;
+
+	/* Now add one block to each for the header */
+	req_size += 1;
+	min_size += 1;
+
+	/* Find a free partition that will give us the maximum needed size 
+	   If can't find one that will give us the minimum size needed */
+	list_for_each_entry(part, &nvram_partitions, partition) {
+		if (part->header.signature != NVRAM_SIG_FREE)
+			continue;
+
+		if (part->header.length >= req_size) {
+			size = req_size;
+			free_part = part;
+			break;
+		}
+		if (part->header.length > size &&
+		    part->header.length >= min_size) {
+			size = part->header.length;
+			free_part = part;
+		}
+	}
+	if (!size)
+		return -ENOSPC;
+	
+	/* Create our OS partition */
+	new_part = kzalloc(sizeof(*new_part), GFP_KERNEL);
+	if (!new_part) {
+		pr_err("%s: kmalloc failed\n", __func__);
+		return -ENOMEM;
+	}
+
+	new_part->index = free_part->index;
+	new_part->header.signature = sig;
+	new_part->header.length = size;
+	memcpy(new_part->header.name, name, strnlen(name, sizeof(new_part->header.name)));
+	new_part->header.checksum = nvram_checksum(&new_part->header);
+
+	rc = nvram_write_header(new_part);
+	if (rc <= 0) {
+		pr_err("%s: nvram_write_header failed (%d)\n", __func__, rc);
+		kfree(new_part);
+		return rc;
+	}
+	list_add_tail(&new_part->partition, &free_part->partition);
+
+	/* Adjust or remove the partition we stole the space from */
+	if (free_part->header.length > size) {
+		free_part->index += size * NVRAM_BLOCK_LEN;
+		free_part->header.length -= size;
+		free_part->header.checksum = nvram_checksum(&free_part->header);
+		rc = nvram_write_header(free_part);
+		if (rc <= 0) {
+			pr_err("%s: nvram_write_header failed (%d)\n",
+			       __func__, rc);
+			return rc;
+		}
+	} else {
+		list_del(&free_part->partition);
+		kfree(free_part);
+	} 
+
+	/* Clear the new partition */
+	for (tmp_index = new_part->index + NVRAM_HEADER_LEN;
+	     tmp_index <  ((size - 1) * NVRAM_BLOCK_LEN);
+	     tmp_index += NVRAM_BLOCK_LEN) {
+		rc = ppc_md.nvram_write(nv_init_vals, NVRAM_BLOCK_LEN, &tmp_index);
+		if (rc <= 0) {
+			pr_err("%s: nvram_write failed (%d)\n",
+			       __func__, rc);
+			return rc;
+		}
+	}
+
+	return new_part->index + NVRAM_HEADER_LEN;
+}
+
+/**
+ * nvram_get_partition_size - Get the data size of an nvram partition
+ * @data_index: This is the offset of the start of the data of
+ *              the partition. The same value that is returned by
+ *              nvram_create_partition().
+ */
+int nvram_get_partition_size(loff_t data_index)
+{
+	struct nvram_partition *part;
+	
+	list_for_each_entry(part, &nvram_partitions, partition) {
+		if (part->index + NVRAM_HEADER_LEN == data_index)
+			return (part->header.length - 1) * NVRAM_BLOCK_LEN;
+	}
+	return -1;
+}
+
+
+/**
+ * nvram_find_partition - Find an nvram partition by signature and name
+ * @name: Name of the partition or NULL for any name
+ * @sig: Signature to test against
+ * @out_size: if non-NULL, returns the size of the data part of the partition
+ */
+loff_t nvram_find_partition(const char *name, int sig, int *out_size)
+{
+	struct nvram_partition *p;
+
+	list_for_each_entry(p, &nvram_partitions, partition) {
+		if (p->header.signature == sig &&
+		    (!name || !strncmp(p->header.name, name, 12))) {
+			if (out_size)
+				*out_size = (p->header.length - 1) *
+					NVRAM_BLOCK_LEN;
+			return p->index + NVRAM_HEADER_LEN;
+		}
+	}
+	return 0;
+}
+
+int __init nvram_scan_partitions(void)
+{
+	loff_t cur_index = 0;
+	struct nvram_header phead;
+	struct nvram_partition * tmp_part;
+	unsigned char c_sum;
+	char * header;
+	int total_size;
+	int err;
+
+	if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0)
+		return -ENODEV;
+	total_size = ppc_md.nvram_size();
+	
+	header = kmalloc(NVRAM_HEADER_LEN, GFP_KERNEL);
+	if (!header) {
+		printk(KERN_ERR "nvram_scan_partitions: Failed kmalloc\n");
+		return -ENOMEM;
+	}
+
+	while (cur_index < total_size) {
+
+		err = ppc_md.nvram_read(header, NVRAM_HEADER_LEN, &cur_index);
+		if (err != NVRAM_HEADER_LEN) {
+			printk(KERN_ERR "nvram_scan_partitions: Error parsing "
+			       "nvram partitions\n");
+			goto out;
+		}
+
+		cur_index -= NVRAM_HEADER_LEN; /* nvram_read will advance us */
+
+		memcpy(&phead, header, NVRAM_HEADER_LEN);
+
+		phead.length = be16_to_cpu(phead.length);
+
+		err = 0;
+		c_sum = nvram_checksum(&phead);
+		if (c_sum != phead.checksum) {
+			printk(KERN_WARNING "WARNING: nvram partition checksum"
+			       " was %02x, should be %02x!\n",
+			       phead.checksum, c_sum);
+			printk(KERN_WARNING "Terminating nvram partition scan\n");
+			goto out;
+		}
+		if (!phead.length) {
+			printk(KERN_WARNING "WARNING: nvram corruption "
+			       "detected: 0-length partition\n");
+			goto out;
+		}
+		tmp_part = kmalloc(sizeof(*tmp_part), GFP_KERNEL);
+		err = -ENOMEM;
+		if (!tmp_part) {
+			printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n");
+			goto out;
+		}
+		
+		memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN);
+		tmp_part->index = cur_index;
+		list_add_tail(&tmp_part->partition, &nvram_partitions);
+		
+		cur_index += phead.length * NVRAM_BLOCK_LEN;
+	}
+	err = 0;
+
+#ifdef DEBUG_NVRAM
+	nvram_print_partitions("NVRAM Partitions");
+#endif
+
+ out:
+	kfree(header);
+	return err;
+}
+
+static int __init nvram_init(void)
+{
+	int rc;
+	
+	BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16);
+
+	if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0)
+		return  -ENODEV;
+
+  	rc = misc_register(&nvram_dev);
+	if (rc != 0) {
+		printk(KERN_ERR "nvram_init: failed to register device\n");
+		return rc;
+	}
+  	
+  	return rc;
+}
+device_initcall(nvram_init);
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
new file mode 100644
index 000000000..becaec990
--- /dev/null
+++ b/arch/powerpc/kernel/of_platform.c
@@ -0,0 +1,119 @@
+/*
+ *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *    and		 Arnd Bergmann, IBM Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#undef DEBUG
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/mod_devicetable.h>
+#include <linux/pci.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/atomic.h>
+
+#include <asm/errno.h>
+#include <asm/topology.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/eeh.h>
+
+#ifdef CONFIG_PPC_OF_PLATFORM_PCI
+
+/* The probing of PCI controllers from of_platform is currently
+ * 64 bits only, mostly due to gratuitous differences between
+ * the 32 and 64 bits PCI code on PowerPC and the 32 bits one
+ * lacking some bits needed here.
+ */
+
+static int of_pci_phb_probe(struct platform_device *dev)
+{
+	struct pci_controller *phb;
+
+	/* Check if we can do that ... */
+	if (ppc_md.pci_setup_phb == NULL)
+		return -ENODEV;
+
+	pr_info("Setting up PCI bus %pOF\n", dev->dev.of_node);
+
+	/* Alloc and setup PHB data structure */
+	phb = pcibios_alloc_controller(dev->dev.of_node);
+	if (!phb)
+		return -ENODEV;
+
+	/* Setup parent in sysfs */
+	phb->parent = &dev->dev;
+
+	/* Setup the PHB using arch provided callback */
+	if (ppc_md.pci_setup_phb(phb)) {
+		pcibios_free_controller(phb);
+		return -ENODEV;
+	}
+
+	/* Process "ranges" property */
+	pci_process_bridge_OF_ranges(phb, dev->dev.of_node, 0);
+
+	/* Init pci_dn data structures */
+	pci_devs_phb_init_dynamic(phb);
+
+	/* Create EEH devices for the PHB */
+	eeh_dev_phb_init_dynamic(phb);
+
+	/* Register devices with EEH */
+	if (dev->dev.of_node->child)
+		eeh_add_device_tree_early(PCI_DN(dev->dev.of_node));
+
+	/* Scan the bus */
+	pcibios_scan_phb(phb);
+	if (phb->bus == NULL)
+		return -ENXIO;
+
+	/* Claim resources. This might need some rework as well depending
+	 * whether we are doing probe-only or not, like assigning unassigned
+	 * resources etc...
+	 */
+	pcibios_claim_one_bus(phb->bus);
+
+	/* Finish EEH setup */
+	eeh_add_device_tree_late(phb->bus);
+
+	/* Add probed PCI devices to the device model */
+	pci_bus_add_devices(phb->bus);
+
+	/* sysfs files should only be added after devices are added */
+	eeh_add_sysfs_files(phb->bus);
+
+	return 0;
+}
+
+static const struct of_device_id of_pci_phb_ids[] = {
+	{ .type = "pci", },
+	{ .type = "pcix", },
+	{ .type = "pcie", },
+	{ .type = "pciex", },
+	{ .type = "ht", },
+	{}
+};
+
+static struct platform_driver of_pci_phb_driver = {
+	.probe = of_pci_phb_probe,
+	.driver = {
+		.name = "of-pci",
+		.of_match_table = of_pci_phb_ids,
+	},
+};
+
+builtin_platform_driver(of_pci_phb_driver);
+
+#endif /* CONFIG_PPC_OF_PLATFORM_PCI */
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
new file mode 100644
index 000000000..8237884ca
--- /dev/null
+++ b/arch/powerpc/kernel/optprobes.c
@@ -0,0 +1,351 @@
+/*
+ * Code for Kernel probes Jump optimization.
+ *
+ * Copyright 2017, Anju T, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/jump_label.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+#include <asm/ptrace.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/sstep.h>
+#include <asm/ppc-opcode.h>
+
+#define TMPL_CALL_HDLR_IDX	\
+	(optprobe_template_call_handler - optprobe_template_entry)
+#define TMPL_EMULATE_IDX	\
+	(optprobe_template_call_emulate - optprobe_template_entry)
+#define TMPL_RET_IDX		\
+	(optprobe_template_ret - optprobe_template_entry)
+#define TMPL_OP_IDX		\
+	(optprobe_template_op_address - optprobe_template_entry)
+#define TMPL_INSN_IDX		\
+	(optprobe_template_insn - optprobe_template_entry)
+#define TMPL_END_IDX		\
+	(optprobe_template_end - optprobe_template_entry)
+
+DEFINE_INSN_CACHE_OPS(ppc_optinsn);
+
+static bool insn_page_in_use;
+
+static void *__ppc_alloc_insn_page(void)
+{
+	if (insn_page_in_use)
+		return NULL;
+	insn_page_in_use = true;
+	return &optinsn_slot;
+}
+
+static void __ppc_free_insn_page(void *page __maybe_unused)
+{
+	insn_page_in_use = false;
+}
+
+struct kprobe_insn_cache kprobe_ppc_optinsn_slots = {
+	.mutex = __MUTEX_INITIALIZER(kprobe_ppc_optinsn_slots.mutex),
+	.pages = LIST_HEAD_INIT(kprobe_ppc_optinsn_slots.pages),
+	/* insn_size initialized later */
+	.alloc = __ppc_alloc_insn_page,
+	.free = __ppc_free_insn_page,
+	.nr_garbage = 0,
+};
+
+/*
+ * Check if we can optimize this probe. Returns NIP post-emulation if this can
+ * be optimized and 0 otherwise.
+ */
+static unsigned long can_optimize(struct kprobe *p)
+{
+	struct pt_regs regs;
+	struct instruction_op op;
+	unsigned long nip = 0;
+
+	/*
+	 * kprobe placed for kretprobe during boot time
+	 * has a 'nop' instruction, which can be emulated.
+	 * So further checks can be skipped.
+	 */
+	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
+		return (unsigned long)p->addr + sizeof(kprobe_opcode_t);
+
+	/*
+	 * We only support optimizing kernel addresses, but not
+	 * module addresses.
+	 *
+	 * FIXME: Optimize kprobes placed in module addresses.
+	 */
+	if (!is_kernel_addr((unsigned long)p->addr))
+		return 0;
+
+	memset(&regs, 0, sizeof(struct pt_regs));
+	regs.nip = (unsigned long)p->addr;
+	regs.trap = 0x0;
+	regs.msr = MSR_KERNEL;
+
+	/*
+	 * Kprobe placed in conditional branch instructions are
+	 * not optimized, as we can't predict the nip prior with
+	 * dummy pt_regs and can not ensure that the return branch
+	 * from detour buffer falls in the range of address (i.e 32MB).
+	 * A branch back from trampoline is set up in the detour buffer
+	 * to the nip returned by the analyse_instr() here.
+	 *
+	 * Ensure that the instruction is not a conditional branch,
+	 * and that can be emulated.
+	 */
+	if (!is_conditional_branch(*p->ainsn.insn) &&
+			analyse_instr(&op, &regs, *p->ainsn.insn) == 1) {
+		emulate_update_regs(&regs, &op);
+		nip = regs.nip;
+	}
+
+	return nip;
+}
+
+static void optimized_callback(struct optimized_kprobe *op,
+			       struct pt_regs *regs)
+{
+	/* This is possible if op is under delayed unoptimizing */
+	if (kprobe_disabled(&op->kp))
+		return;
+
+	preempt_disable();
+
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		regs->nip = (unsigned long)op->kp.addr;
+		get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+
+	preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	if (op->optinsn.insn) {
+		free_ppc_optinsn_slot(op->optinsn.insn, 1);
+		op->optinsn.insn = NULL;
+	}
+}
+
+/*
+ * emulate_step() requires insn to be emulated as
+ * second parameter. Load register 'r4' with the
+ * instruction.
+ */
+void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
+{
+	/* addis r4,0,(insn)@h */
+	patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) |
+			  ((val >> 16) & 0xffff));
+	addr++;
+
+	/* ori r4,r4,(insn)@l */
+	patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) |
+			  ___PPC_RS(4) | (val & 0xffff));
+}
+
+/*
+ * Generate instructions to load provided immediate 64-bit value
+ * to register 'r3' and patch these instructions at 'addr'.
+ */
+void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr)
+{
+	/* lis r3,(op)@highest */
+	patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) |
+			  ((val >> 48) & 0xffff));
+	addr++;
+
+	/* ori r3,r3,(op)@higher */
+	patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+			  ___PPC_RS(3) | ((val >> 32) & 0xffff));
+	addr++;
+
+	/* rldicr r3,r3,32,31 */
+	patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) |
+			  ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31));
+	addr++;
+
+	/* oris r3,r3,(op)@h */
+	patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) |
+			  ___PPC_RS(3) | ((val >> 16) & 0xffff));
+	addr++;
+
+	/* ori r3,r3,(op)@l */
+	patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+			  ___PPC_RS(3) | (val & 0xffff));
+}
+
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+	kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step;
+	kprobe_opcode_t *op_callback_addr, *emulate_step_addr;
+	long b_offset;
+	unsigned long nip, size;
+	int rc, i;
+
+	kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+
+	nip = can_optimize(p);
+	if (!nip)
+		return -EILSEQ;
+
+	/* Allocate instruction slot for detour buffer */
+	buff = get_ppc_optinsn_slot();
+	if (!buff)
+		return -ENOMEM;
+
+	/*
+	 * OPTPROBE uses 'b' instruction to branch to optinsn.insn.
+	 *
+	 * The target address has to be relatively nearby, to permit use
+	 * of branch instruction in powerpc, because the address is specified
+	 * in an immediate field in the instruction opcode itself, ie 24 bits
+	 * in the opcode specify the address. Therefore the address should
+	 * be within 32MB on either side of the current instruction.
+	 */
+	b_offset = (unsigned long)buff - (unsigned long)p->addr;
+	if (!is_offset_in_branch_range(b_offset))
+		goto error;
+
+	/* Check if the return address is also within 32MB range */
+	b_offset = (unsigned long)(buff + TMPL_RET_IDX) -
+			(unsigned long)nip;
+	if (!is_offset_in_branch_range(b_offset))
+		goto error;
+
+	/* Setup template */
+	/* We can optimize this via patch_instruction_window later */
+	size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);
+	pr_devel("Copying template to %p, size %lu\n", buff, size);
+	for (i = 0; i < size; i++) {
+		rc = patch_instruction(buff + i, *(optprobe_template_entry + i));
+		if (rc < 0)
+			goto error;
+	}
+
+	/*
+	 * Fixup the template with instructions to:
+	 * 1. load the address of the actual probepoint
+	 */
+	patch_imm64_load_insns((unsigned long)op, buff + TMPL_OP_IDX);
+
+	/*
+	 * 2. branch to optimized_callback() and emulate_step()
+	 */
+	op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
+	emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
+	if (!op_callback_addr || !emulate_step_addr) {
+		WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
+		goto error;
+	}
+
+	branch_op_callback = create_branch((unsigned int *)buff + TMPL_CALL_HDLR_IDX,
+				(unsigned long)op_callback_addr,
+				BRANCH_SET_LINK);
+
+	branch_emulate_step = create_branch((unsigned int *)buff + TMPL_EMULATE_IDX,
+				(unsigned long)emulate_step_addr,
+				BRANCH_SET_LINK);
+
+	if (!branch_op_callback || !branch_emulate_step)
+		goto error;
+
+	patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback);
+	patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step);
+
+	/*
+	 * 3. load instruction to be emulated into relevant register, and
+	 */
+	patch_imm32_load_insns(*p->ainsn.insn, buff + TMPL_INSN_IDX);
+
+	/*
+	 * 4. branch back from trampoline
+	 */
+	patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0);
+
+	flush_icache_range((unsigned long)buff,
+			   (unsigned long)(&buff[TMPL_END_IDX]));
+
+	op->optinsn.insn = buff;
+
+	return 0;
+
+error:
+	free_ppc_optinsn_slot(buff, 0);
+	return -ERANGE;
+
+}
+
+int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->insn != NULL;
+}
+
+/*
+ * On powerpc, Optprobes always replaces one instruction (4 bytes
+ * aligned and 4 bytes long). It is impossible to encounter another
+ * kprobe in this address range. So always return 0.
+ */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	return 0;
+}
+
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op;
+	struct optimized_kprobe *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		/*
+		 * Backup instructions which will be replaced
+		 * by jump address
+		 */
+		memcpy(op->optinsn.copied_insn, op->kp.addr,
+					       RELATIVEJUMP_SIZE);
+		patch_instruction(op->kp.addr,
+			create_branch((unsigned int *)op->kp.addr,
+				      (unsigned long)op->optinsn.insn, 0));
+		list_del_init(&op->list);
+	}
+}
+
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	arch_arm_kprobe(&op->kp);
+}
+
+void arch_unoptimize_kprobes(struct list_head *oplist,
+			     struct list_head *done_list)
+{
+	struct optimized_kprobe *op;
+	struct optimized_kprobe *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		arch_unoptimize_kprobe(op);
+		list_move(&op->list, done_list);
+	}
+}
+
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				 unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
+}
diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S
new file mode 100644
index 000000000..98a3aeeb3
--- /dev/null
+++ b/arch/powerpc/kernel/optprobes_head.S
@@ -0,0 +1,134 @@
+/*
+ * Code to prepare detour buffer for optprobes in Kernel.
+ *
+ * Copyright 2017, Anju T, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/ptrace.h>
+#include <asm/asm-offsets.h>
+
+#define	OPT_SLOT_SIZE	65536
+
+	.balign	4
+
+	/*
+	 * Reserve an area to allocate slots for detour buffer.
+	 * This is part of .text section (rather than vmalloc area)
+	 * as this needs to be within 32MB of the probed address.
+	 */
+	.global optinsn_slot
+optinsn_slot:
+	.space	OPT_SLOT_SIZE
+
+	/*
+	 * Optprobe template:
+	 * This template gets copied into one of the slots in optinsn_slot
+	 * and gets fixed up with real optprobe structures et al.
+	 */
+	.global optprobe_template_entry
+optprobe_template_entry:
+	/* Create an in-memory pt_regs */
+	stdu	r1,-INT_FRAME_SIZE(r1)
+	SAVE_GPR(0,r1)
+	/* Save the previous SP into stack */
+	addi	r0,r1,INT_FRAME_SIZE
+	std	r0,GPR1(r1)
+	SAVE_10GPRS(2,r1)
+	SAVE_10GPRS(12,r1)
+	SAVE_10GPRS(22,r1)
+	/* Save SPRS */
+	mfmsr	r5
+	std	r5,_MSR(r1)
+	li	r5,0x700
+	std	r5,_TRAP(r1)
+	li	r5,0
+	std	r5,ORIG_GPR3(r1)
+	std	r5,RESULT(r1)
+	mfctr	r5
+	std	r5,_CTR(r1)
+	mflr	r5
+	std	r5,_LINK(r1)
+	mfspr	r5,SPRN_XER
+	std	r5,_XER(r1)
+	mfcr	r5
+	std	r5,_CCR(r1)
+	lbz     r5,PACAIRQSOFTMASK(r13)
+	std     r5,SOFTE(r1)
+
+	/*
+	 * We may get here from a module, so load the kernel TOC in r2.
+	 * The original TOC gets restored when pt_regs is restored
+	 * further below.
+	 */
+	ld	r2,PACATOC(r13)
+
+	.global optprobe_template_op_address
+optprobe_template_op_address:
+	/*
+	 * Parameters to optimized_callback():
+	 * 1. optimized_kprobe structure in r3
+	 */
+	nop
+	nop
+	nop
+	nop
+	nop
+	/* 2. pt_regs pointer in r4 */
+	addi	r4,r1,STACK_FRAME_OVERHEAD
+
+	.global optprobe_template_call_handler
+optprobe_template_call_handler:
+	/* Branch to optimized_callback() */
+	nop
+
+	/*
+	 * Parameters for instruction emulation:
+	 * 1. Pass SP in register r3.
+	 */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+
+	.global optprobe_template_insn
+optprobe_template_insn:
+	/* 2, Pass instruction to be emulated in r4 */
+	nop
+	nop
+
+	.global optprobe_template_call_emulate
+optprobe_template_call_emulate:
+	/* Branch to emulate_step()  */
+	nop
+
+	/*
+	 * All done.
+	 * Now, restore the registers...
+	 */
+	ld	r5,_MSR(r1)
+	mtmsr	r5
+	ld	r5,_CTR(r1)
+	mtctr	r5
+	ld	r5,_LINK(r1)
+	mtlr	r5
+	ld	r5,_XER(r1)
+	mtxer	r5
+	ld	r5,_CCR(r1)
+	mtcr	r5
+	REST_GPR(0,r1)
+	REST_10GPRS(2,r1)
+	REST_10GPRS(12,r1)
+	REST_10GPRS(22,r1)
+	/* Restore the previous SP */
+	addi	r1,r1,INT_FRAME_SIZE
+
+	.global optprobe_template_ret
+optprobe_template_ret:
+	/* ... and jump back from trampoline */
+	nop
+
+	.global optprobe_template_end
+optprobe_template_end:
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
new file mode 100644
index 000000000..0ee3e6d50
--- /dev/null
+++ b/arch/powerpc/kernel/paca.c
@@ -0,0 +1,282 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/smp.h>
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/sched/task.h>
+
+#include <asm/lppaca.h>
+#include <asm/paca.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/kexec.h>
+
+#include "setup.h"
+
+#ifndef CONFIG_SMP
+#define boot_cpuid 0
+#endif
+
+static void *__init alloc_paca_data(unsigned long size, unsigned long align,
+				unsigned long limit, int cpu)
+{
+	unsigned long pa;
+	int nid;
+
+	/*
+	 * boot_cpuid paca is allocated very early before cpu_to_node is up.
+	 * Set bottom-up mode, because the boot CPU should be on node-0,
+	 * which will put its paca in the right place.
+	 */
+	if (cpu == boot_cpuid) {
+		nid = -1;
+		memblock_set_bottom_up(true);
+	} else {
+		nid = early_cpu_to_node(cpu);
+	}
+
+	pa = memblock_alloc_base_nid(size, align, limit, nid, MEMBLOCK_NONE);
+	if (!pa) {
+		pa = memblock_alloc_base(size, align, limit);
+		if (!pa)
+			panic("cannot allocate paca data");
+	}
+
+	if (cpu == boot_cpuid)
+		memblock_set_bottom_up(false);
+
+	return __va(pa);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+
+/*
+ * See asm/lppaca.h for more detail.
+ *
+ * lppaca structures must must be 1kB in size, L1 cache line aligned,
+ * and not cross 4kB boundary. A 1kB size and 1kB alignment will satisfy
+ * these requirements.
+ */
+static inline void init_lppaca(struct lppaca *lppaca)
+{
+	BUILD_BUG_ON(sizeof(struct lppaca) != 640);
+
+	*lppaca = (struct lppaca) {
+		.desc = cpu_to_be32(0xd397d781),	/* "LpPa" */
+		.size = cpu_to_be16(0x400),
+		.fpregs_in_use = 1,
+		.slb_count = cpu_to_be16(64),
+		.vmxregs_in_use = 0,
+		.page_ins = 0, };
+};
+
+static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
+{
+	struct lppaca *lp;
+	size_t size = 0x400;
+
+	BUILD_BUG_ON(size < sizeof(struct lppaca));
+
+	if (early_cpu_has_feature(CPU_FTR_HVMODE))
+		return NULL;
+
+	lp = alloc_paca_data(size, 0x400, limit, cpu);
+	init_lppaca(lp);
+
+	return lp;
+}
+#endif /* CONFIG_PPC_BOOK3S */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+/*
+ * 3 persistent SLBs are allocated here.  The buffer will be zero
+ * initially, hence will all be invaild until we actually write them.
+ *
+ * If you make the number of persistent SLB entries dynamic, please also
+ * update PR KVM to flush and restore them accordingly.
+ */
+static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit)
+{
+	struct slb_shadow *s;
+
+	if (cpu != boot_cpuid) {
+		/*
+		 * Boot CPU comes here before early_radix_enabled
+		 * is parsed (e.g., for disable_radix). So allocate
+		 * always and this will be fixed up in free_unused_pacas.
+		 */
+		if (early_radix_enabled())
+			return NULL;
+	}
+
+	s = alloc_paca_data(sizeof(*s), L1_CACHE_BYTES, limit, cpu);
+	memset(s, 0, sizeof(*s));
+
+	s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
+	s->buffer_length = cpu_to_be32(sizeof(*s));
+
+	return s;
+}
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+/* The Paca is an array with one entry per processor.  Each contains an
+ * lppaca, which contains the information shared between the
+ * hypervisor and Linux.
+ * On systems with hardware multi-threading, there are two threads
+ * per processor.  The Paca array must contain an entry for each thread.
+ * The VPD Areas will give a max logical processors = 2 * max physical
+ * processors.  The processor VPD array needs one entry per physical
+ * processor (not thread).
+ */
+struct paca_struct **paca_ptrs __read_mostly;
+EXPORT_SYMBOL(paca_ptrs);
+
+void __init initialise_paca(struct paca_struct *new_paca, int cpu)
+{
+#ifdef CONFIG_PPC_PSERIES
+	new_paca->lppaca_ptr = NULL;
+#endif
+#ifdef CONFIG_PPC_BOOK3E
+	new_paca->kernel_pgd = swapper_pg_dir;
+#endif
+	new_paca->lock_token = 0x8000;
+	new_paca->paca_index = cpu;
+	new_paca->kernel_toc = kernel_toc_addr();
+	new_paca->kernelbase = (unsigned long) _stext;
+	/* Only set MSR:IR/DR when MMU is initialized */
+	new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR);
+	new_paca->hw_cpu_id = 0xffff;
+	new_paca->kexec_state = KEXEC_STATE_NONE;
+	new_paca->__current = &init_task;
+	new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
+#ifdef CONFIG_PPC_BOOK3S_64
+	new_paca->slb_shadow_ptr = NULL;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* For now -- if we have threads this will be adjusted later */
+	new_paca->tcd_ptr = &new_paca->tcd;
+#endif
+}
+
+/* Put the paca pointer into r13 and SPRG_PACA */
+void setup_paca(struct paca_struct *new_paca)
+{
+	/* Setup r13 */
+	local_paca = new_paca;
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* On Book3E, initialize the TLB miss exception frames */
+	mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb);
+#else
+	/* In HV mode, we setup both HPACA and PACA to avoid problems
+	 * if we do a GET_PACA() before the feature fixups have been
+	 * applied
+	 */
+	if (early_cpu_has_feature(CPU_FTR_HVMODE))
+		mtspr(SPRN_SPRG_HPACA, local_paca);
+#endif
+	mtspr(SPRN_SPRG_PACA, local_paca);
+
+}
+
+static int __initdata paca_nr_cpu_ids;
+static int __initdata paca_ptrs_size;
+static int __initdata paca_struct_size;
+
+void __init allocate_paca_ptrs(void)
+{
+	paca_nr_cpu_ids = nr_cpu_ids;
+
+	paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+	paca_ptrs = __va(memblock_alloc(paca_ptrs_size, 0));
+	memset(paca_ptrs, 0x88, paca_ptrs_size);
+}
+
+void __init allocate_paca(int cpu)
+{
+	u64 limit;
+	struct paca_struct *paca;
+
+	BUG_ON(cpu >= paca_nr_cpu_ids);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * We access pacas in real mode, and cannot take SLB faults
+	 * on them when in virtual mode, so allocate them accordingly.
+	 */
+	limit = min(ppc64_bolted_size(), ppc64_rma_size);
+#else
+	limit = ppc64_rma_size;
+#endif
+
+	paca = alloc_paca_data(sizeof(struct paca_struct), L1_CACHE_BYTES,
+				limit, cpu);
+	paca_ptrs[cpu] = paca;
+	memset(paca, 0, sizeof(struct paca_struct));
+
+	initialise_paca(paca, cpu);
+#ifdef CONFIG_PPC_PSERIES
+	paca->lppaca_ptr = new_lppaca(cpu, limit);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	paca->slb_shadow_ptr = new_slb_shadow(cpu, limit);
+#endif
+	paca_struct_size += sizeof(struct paca_struct);
+}
+
+void __init free_unused_pacas(void)
+{
+	int new_ptrs_size;
+
+	new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+	if (new_ptrs_size < paca_ptrs_size)
+		memblock_free(__pa(paca_ptrs) + new_ptrs_size,
+					paca_ptrs_size - new_ptrs_size);
+
+	paca_nr_cpu_ids = nr_cpu_ids;
+	paca_ptrs_size = new_ptrs_size;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (early_radix_enabled()) {
+		/* Ugly fixup, see new_slb_shadow() */
+		memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
+				sizeof(struct slb_shadow));
+		paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL;
+	}
+#endif
+
+	printk(KERN_DEBUG "Allocated %u bytes for %u pacas\n",
+			paca_ptrs_size + paca_struct_size, nr_cpu_ids);
+}
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+	mm_context_t *context = &mm->context;
+
+	get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+	VM_BUG_ON(!mm->context.slb_addr_limit);
+	get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit;
+	memcpy(&get_paca()->mm_ctx_low_slices_psize,
+	       &context->low_slices_psize, sizeof(context->low_slices_psize));
+	memcpy(&get_paca()->mm_ctx_high_slices_psize,
+	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
+#else /* CONFIG_PPC_MM_SLICES */
+	get_paca()->mm_ctx_user_psize = context->user_psize;
+	get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* !CONFIG_PPC_BOOK3S */
+	return;
+#endif
+}
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
new file mode 100644
index 000000000..74628aca2
--- /dev/null
+++ b/arch/powerpc/kernel/pci-common.c
@@ -0,0 +1,1683 @@
+/*
+ * Contains common pci routines for ALL ppc platform
+ * (based on pci_32.c and pci_64.c)
+ *
+ * Port for PPC64 David Engebretsen, IBM Corp.
+ * Contains common pci routines for ppc64 platform, pSeries and iSeries brands.
+ *
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *   Rework, based on alpha PCI code.
+ *
+ * Common pmac/prep/chrp pci routines. -- Cort
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/mm.h>
+#include <linux/shmem_fs.h>
+#include <linux/list.h>
+#include <linux/syscalls.h>
+#include <linux/irq.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/vgaarb.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/byteorder.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/eeh.h>
+
+#include "../../../drivers/pci/pci.h"
+
+/* hose_spinlock protects accesses to the the phb_bitmap. */
+static DEFINE_SPINLOCK(hose_spinlock);
+LIST_HEAD(hose_list);
+
+/* For dynamic PHB numbering on get_phb_number(): max number of PHBs. */
+#define MAX_PHBS 0x10000
+
+/*
+ * For dynamic PHB numbering: used/free PHBs tracking bitmap.
+ * Accesses to this bitmap should be protected by hose_spinlock.
+ */
+static DECLARE_BITMAP(phb_bitmap, MAX_PHBS);
+
+/* ISA Memory physical address */
+resource_size_t isa_mem_base;
+EXPORT_SYMBOL(isa_mem_base);
+
+
+static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops;
+
+void set_pci_dma_ops(const struct dma_map_ops *dma_ops)
+{
+	pci_dma_ops = dma_ops;
+}
+
+const struct dma_map_ops *get_pci_dma_ops(void)
+{
+	return pci_dma_ops;
+}
+EXPORT_SYMBOL(get_pci_dma_ops);
+
+/*
+ * This function should run under locking protection, specifically
+ * hose_spinlock.
+ */
+static int get_phb_number(struct device_node *dn)
+{
+	int ret, phb_id = -1;
+	u32 prop_32;
+	u64 prop;
+
+	/*
+	 * Try fixed PHB numbering first, by checking archs and reading
+	 * the respective device-tree properties. Firstly, try powernv by
+	 * reading "ibm,opal-phbid", only present in OPAL environment.
+	 */
+	ret = of_property_read_u64(dn, "ibm,opal-phbid", &prop);
+	if (ret) {
+		ret = of_property_read_u32_index(dn, "reg", 1, &prop_32);
+		prop = prop_32;
+	}
+
+	if (!ret)
+		phb_id = (int)(prop & (MAX_PHBS - 1));
+
+	/* We need to be sure to not use the same PHB number twice. */
+	if ((phb_id >= 0) && !test_and_set_bit(phb_id, phb_bitmap))
+		return phb_id;
+
+	/*
+	 * If not pseries nor powernv, or if fixed PHB numbering tried to add
+	 * the same PHB number twice, then fallback to dynamic PHB numbering.
+	 */
+	phb_id = find_first_zero_bit(phb_bitmap, MAX_PHBS);
+	BUG_ON(phb_id >= MAX_PHBS);
+	set_bit(phb_id, phb_bitmap);
+
+	return phb_id;
+}
+
+struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
+{
+	struct pci_controller *phb;
+
+	phb = zalloc_maybe_bootmem(sizeof(struct pci_controller), GFP_KERNEL);
+	if (phb == NULL)
+		return NULL;
+	spin_lock(&hose_spinlock);
+	phb->global_number = get_phb_number(dev);
+	list_add_tail(&phb->list_node, &hose_list);
+	spin_unlock(&hose_spinlock);
+	phb->dn = dev;
+	phb->is_dynamic = slab_is_available();
+#ifdef CONFIG_PPC64
+	if (dev) {
+		int nid = of_node_to_nid(dev);
+
+		if (nid < 0 || !node_online(nid))
+			nid = -1;
+
+		PHB_SET_NODE(phb, nid);
+	}
+#endif
+	return phb;
+}
+EXPORT_SYMBOL_GPL(pcibios_alloc_controller);
+
+void pcibios_free_controller(struct pci_controller *phb)
+{
+	spin_lock(&hose_spinlock);
+
+	/* Clear bit of phb_bitmap to allow reuse of this PHB number. */
+	if (phb->global_number < MAX_PHBS)
+		clear_bit(phb->global_number, phb_bitmap);
+
+	list_del(&phb->list_node);
+	spin_unlock(&hose_spinlock);
+
+	if (phb->is_dynamic)
+		kfree(phb);
+}
+EXPORT_SYMBOL_GPL(pcibios_free_controller);
+
+/*
+ * This function is used to call pcibios_free_controller()
+ * in a deferred manner: a callback from the PCI subsystem.
+ *
+ * _*DO NOT*_ call pcibios_free_controller() explicitly if
+ * this is used (or it may access an invalid *phb pointer).
+ *
+ * The callback occurs when all references to the root bus
+ * are dropped (e.g., child buses/devices and their users).
+ *
+ * It's called as .release_fn() of 'struct pci_host_bridge'
+ * which is associated with the 'struct pci_controller.bus'
+ * (root bus) - it expects .release_data to hold a pointer
+ * to 'struct pci_controller'.
+ *
+ * In order to use it, register .release_fn()/release_data
+ * like this:
+ *
+ * pci_set_host_bridge_release(bridge,
+ *                             pcibios_free_controller_deferred
+ *                             (void *) phb);
+ *
+ * e.g. in the pcibios_root_bridge_prepare() callback from
+ * pci_create_root_bus().
+ */
+void pcibios_free_controller_deferred(struct pci_host_bridge *bridge)
+{
+	struct pci_controller *phb = (struct pci_controller *)
+					 bridge->release_data;
+
+	pr_debug("domain %d, dynamic %d\n", phb->global_number, phb->is_dynamic);
+
+	pcibios_free_controller(phb);
+}
+EXPORT_SYMBOL_GPL(pcibios_free_controller_deferred);
+
+/*
+ * The function is used to return the minimal alignment
+ * for memory or I/O windows of the associated P2P bridge.
+ * By default, 4KiB alignment for I/O windows and 1MiB for
+ * memory windows.
+ */
+resource_size_t pcibios_window_alignment(struct pci_bus *bus,
+					 unsigned long type)
+{
+	struct pci_controller *phb = pci_bus_to_host(bus);
+
+	if (phb->controller_ops.window_alignment)
+		return phb->controller_ops.window_alignment(bus, type);
+
+	/*
+	 * PCI core will figure out the default
+	 * alignment: 4KiB for I/O and 1MiB for
+	 * memory window.
+	 */
+	return 1;
+}
+
+void pcibios_setup_bridge(struct pci_bus *bus, unsigned long type)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+
+	if (hose->controller_ops.setup_bridge)
+		hose->controller_ops.setup_bridge(bus, type);
+}
+
+void pcibios_reset_secondary_bus(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.reset_secondary_bus) {
+		phb->controller_ops.reset_secondary_bus(dev);
+		return;
+	}
+
+	pci_reset_secondary_bus(dev);
+}
+
+resource_size_t pcibios_default_alignment(void)
+{
+	if (ppc_md.pcibios_default_alignment)
+		return ppc_md.pcibios_default_alignment();
+
+	return 0;
+}
+
+#ifdef CONFIG_PCI_IOV
+resource_size_t pcibios_iov_resource_alignment(struct pci_dev *pdev, int resno)
+{
+	if (ppc_md.pcibios_iov_resource_alignment)
+		return ppc_md.pcibios_iov_resource_alignment(pdev, resno);
+
+	return pci_iov_resource_size(pdev, resno);
+}
+
+int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	if (ppc_md.pcibios_sriov_enable)
+		return ppc_md.pcibios_sriov_enable(pdev, num_vfs);
+
+	return 0;
+}
+
+int pcibios_sriov_disable(struct pci_dev *pdev)
+{
+	if (ppc_md.pcibios_sriov_disable)
+		return ppc_md.pcibios_sriov_disable(pdev);
+
+	return 0;
+}
+
+#endif /* CONFIG_PCI_IOV */
+
+void pcibios_bus_add_device(struct pci_dev *pdev)
+{
+	if (ppc_md.pcibios_bus_add_device)
+		ppc_md.pcibios_bus_add_device(pdev);
+}
+
+static resource_size_t pcibios_io_size(const struct pci_controller *hose)
+{
+#ifdef CONFIG_PPC64
+	return hose->pci_io_size;
+#else
+	return resource_size(&hose->io_resource);
+#endif
+}
+
+int pcibios_vaddr_is_ioport(void __iomem *address)
+{
+	int ret = 0;
+	struct pci_controller *hose;
+	resource_size_t size;
+
+	spin_lock(&hose_spinlock);
+	list_for_each_entry(hose, &hose_list, list_node) {
+		size = pcibios_io_size(hose);
+		if (address >= hose->io_base_virt &&
+		    address < (hose->io_base_virt + size)) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&hose_spinlock);
+	return ret;
+}
+
+unsigned long pci_address_to_pio(phys_addr_t address)
+{
+	struct pci_controller *hose;
+	resource_size_t size;
+	unsigned long ret = ~0;
+
+	spin_lock(&hose_spinlock);
+	list_for_each_entry(hose, &hose_list, list_node) {
+		size = pcibios_io_size(hose);
+		if (address >= hose->io_base_phys &&
+		    address < (hose->io_base_phys + size)) {
+			unsigned long base =
+				(unsigned long)hose->io_base_virt - _IO_BASE;
+			ret = base + (address - hose->io_base_phys);
+			break;
+		}
+	}
+	spin_unlock(&hose_spinlock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_address_to_pio);
+
+/*
+ * Return the domain number for this bus.
+ */
+int pci_domain_nr(struct pci_bus *bus)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+
+	return hose->global_number;
+}
+EXPORT_SYMBOL(pci_domain_nr);
+
+/* This routine is meant to be used early during boot, when the
+ * PCI bus numbers have not yet been assigned, and you need to
+ * issue PCI config cycles to an OF device.
+ * It could also be used to "fix" RTAS config cycles if you want
+ * to set pci_assign_all_buses to 1 and still use RTAS for PCI
+ * config cycles.
+ */
+struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node)
+{
+	while(node) {
+		struct pci_controller *hose, *tmp;
+		list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+			if (hose->dn == node)
+				return hose;
+		node = node->parent;
+	}
+	return NULL;
+}
+
+/*
+ * Reads the interrupt pin to determine if interrupt is use by card.
+ * If the interrupt is used, then gets the interrupt line from the
+ * openfirmware and sets it in the pci_dev and pci_config line.
+ */
+static int pci_read_irq_line(struct pci_dev *pci_dev)
+{
+	int virq;
+
+	pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
+
+	/* Try to get a mapping from the device-tree */
+	virq = of_irq_parse_and_map_pci(pci_dev, 0, 0);
+	if (virq <= 0) {
+		u8 line, pin;
+
+		/* If that fails, lets fallback to what is in the config
+		 * space and map that through the default controller. We
+		 * also set the type to level low since that's what PCI
+		 * interrupts are. If your platform does differently, then
+		 * either provide a proper interrupt tree or don't use this
+		 * function.
+		 */
+		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin))
+			return -1;
+		if (pin == 0)
+			return -1;
+		if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) ||
+		    line == 0xff || line == 0) {
+			return -1;
+		}
+		pr_debug(" No map ! Using line %d (pin %d) from PCI config\n",
+			 line, pin);
+
+		virq = irq_create_mapping(NULL, line);
+		if (virq)
+			irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW);
+	}
+
+	if (!virq) {
+		pr_debug(" Failed to map !\n");
+		return -1;
+	}
+
+	pr_debug(" Mapped to linux irq %d\n", virq);
+
+	pci_dev->irq = virq;
+
+	return 0;
+}
+
+/*
+ * Platform support for /proc/bus/pci/X/Y mmap()s.
+ *  -- paulus.
+ */
+int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	resource_size_t ioaddr = pci_resource_start(pdev, bar);
+
+	if (!hose)
+		return -EINVAL;
+
+	/* Convert to an offset within this PCI controller */
+	ioaddr -= (unsigned long)hose->io_base_virt - _IO_BASE;
+
+	vma->vm_pgoff += (ioaddr + hose->io_base_phys) >> PAGE_SHIFT;
+	return 0;
+}
+
+/*
+ * This one is used by /dev/mem and fbdev who have no clue about the
+ * PCI device, it tries to find the PCI device first and calls the
+ * above routine
+ */
+pgprot_t pci_phys_mem_access_prot(struct file *file,
+				  unsigned long pfn,
+				  unsigned long size,
+				  pgprot_t prot)
+{
+	struct pci_dev *pdev = NULL;
+	struct resource *found = NULL;
+	resource_size_t offset = ((resource_size_t)pfn) << PAGE_SHIFT;
+	int i;
+
+	if (page_is_ram(pfn))
+		return prot;
+
+	prot = pgprot_noncached(prot);
+	for_each_pci_dev(pdev) {
+		for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
+			struct resource *rp = &pdev->resource[i];
+			int flags = rp->flags;
+
+			/* Active and same type? */
+			if ((flags & IORESOURCE_MEM) == 0)
+				continue;
+			/* In the range of this resource? */
+			if (offset < (rp->start & PAGE_MASK) ||
+			    offset > rp->end)
+				continue;
+			found = rp;
+			break;
+		}
+		if (found)
+			break;
+	}
+	if (found) {
+		if (found->flags & IORESOURCE_PREFETCH)
+			prot = pgprot_noncached_wc(prot);
+		pci_dev_put(pdev);
+	}
+
+	pr_debug("PCI: Non-PCI map for %llx, prot: %lx\n",
+		 (unsigned long long)offset, pgprot_val(prot));
+
+	return prot;
+}
+
+/* This provides legacy IO read access on a bus */
+int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size)
+{
+	unsigned long offset;
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct resource *rp = &hose->io_resource;
+	void __iomem *addr;
+
+	/* Check if port can be supported by that bus. We only check
+	 * the ranges of the PHB though, not the bus itself as the rules
+	 * for forwarding legacy cycles down bridges are not our problem
+	 * here. So if the host bridge supports it, we do it.
+	 */
+	offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+	offset += port;
+
+	if (!(rp->flags & IORESOURCE_IO))
+		return -ENXIO;
+	if (offset < rp->start || (offset + size) > rp->end)
+		return -ENXIO;
+	addr = hose->io_base_virt + port;
+
+	switch(size) {
+	case 1:
+		*((u8 *)val) = in_8(addr);
+		return 1;
+	case 2:
+		if (port & 1)
+			return -EINVAL;
+		*((u16 *)val) = in_le16(addr);
+		return 2;
+	case 4:
+		if (port & 3)
+			return -EINVAL;
+		*((u32 *)val) = in_le32(addr);
+		return 4;
+	}
+	return -EINVAL;
+}
+
+/* This provides legacy IO write access on a bus */
+int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, size_t size)
+{
+	unsigned long offset;
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct resource *rp = &hose->io_resource;
+	void __iomem *addr;
+
+	/* Check if port can be supported by that bus. We only check
+	 * the ranges of the PHB though, not the bus itself as the rules
+	 * for forwarding legacy cycles down bridges are not our problem
+	 * here. So if the host bridge supports it, we do it.
+	 */
+	offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+	offset += port;
+
+	if (!(rp->flags & IORESOURCE_IO))
+		return -ENXIO;
+	if (offset < rp->start || (offset + size) > rp->end)
+		return -ENXIO;
+	addr = hose->io_base_virt + port;
+
+	/* WARNING: The generic code is idiotic. It gets passed a pointer
+	 * to what can be a 1, 2 or 4 byte quantity and always reads that
+	 * as a u32, which means that we have to correct the location of
+	 * the data read within those 32 bits for size 1 and 2
+	 */
+	switch(size) {
+	case 1:
+		out_8(addr, val >> 24);
+		return 1;
+	case 2:
+		if (port & 1)
+			return -EINVAL;
+		out_le16(addr, val >> 16);
+		return 2;
+	case 4:
+		if (port & 3)
+			return -EINVAL;
+		out_le32(addr, val);
+		return 4;
+	}
+	return -EINVAL;
+}
+
+/* This provides legacy IO or memory mmap access on a bus */
+int pci_mmap_legacy_page_range(struct pci_bus *bus,
+			       struct vm_area_struct *vma,
+			       enum pci_mmap_state mmap_state)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	resource_size_t offset =
+		((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT;
+	resource_size_t size = vma->vm_end - vma->vm_start;
+	struct resource *rp;
+
+	pr_debug("pci_mmap_legacy_page_range(%04x:%02x, %s @%llx..%llx)\n",
+		 pci_domain_nr(bus), bus->number,
+		 mmap_state == pci_mmap_mem ? "MEM" : "IO",
+		 (unsigned long long)offset,
+		 (unsigned long long)(offset + size - 1));
+
+	if (mmap_state == pci_mmap_mem) {
+		/* Hack alert !
+		 *
+		 * Because X is lame and can fail starting if it gets an error trying
+		 * to mmap legacy_mem (instead of just moving on without legacy memory
+		 * access) we fake it here by giving it anonymous memory, effectively
+		 * behaving just like /dev/zero
+		 */
+		if ((offset + size) > hose->isa_mem_size) {
+			printk(KERN_DEBUG
+			       "Process %s (pid:%d) mapped non-existing PCI legacy memory for 0%04x:%02x\n",
+			       current->comm, current->pid, pci_domain_nr(bus), bus->number);
+			if (vma->vm_flags & VM_SHARED)
+				return shmem_zero_setup(vma);
+			return 0;
+		}
+		offset += hose->isa_mem_phys;
+	} else {
+		unsigned long io_offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+		unsigned long roffset = offset + io_offset;
+		rp = &hose->io_resource;
+		if (!(rp->flags & IORESOURCE_IO))
+			return -ENXIO;
+		if (roffset < rp->start || (roffset + size) > rp->end)
+			return -ENXIO;
+		offset += hose->io_base_phys;
+	}
+	pr_debug(" -> mapping phys %llx\n", (unsigned long long)offset);
+
+	vma->vm_pgoff = offset >> PAGE_SHIFT;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+			  const struct resource *rsrc,
+			  resource_size_t *start, resource_size_t *end)
+{
+	struct pci_bus_region region;
+
+	if (rsrc->flags & IORESOURCE_IO) {
+		pcibios_resource_to_bus(dev->bus, &region,
+					(struct resource *) rsrc);
+		*start = region.start;
+		*end = region.end;
+		return;
+	}
+
+	/* We pass a CPU physical address to userland for MMIO instead of a
+	 * BAR value because X is lame and expects to be able to use that
+	 * to pass to /dev/mem!
+	 *
+	 * That means we may have 64-bit values where some apps only expect
+	 * 32 (like X itself since it thinks only Sparc has 64-bit MMIO).
+	 */
+	*start = rsrc->start;
+	*end = rsrc->end;
+}
+
+/**
+ * pci_process_bridge_OF_ranges - Parse PCI bridge resources from device tree
+ * @hose: newly allocated pci_controller to be setup
+ * @dev: device node of the host bridge
+ * @primary: set if primary bus (32 bits only, soon to be deprecated)
+ *
+ * This function will parse the "ranges" property of a PCI host bridge device
+ * node and setup the resource mapping of a pci controller based on its
+ * content.
+ *
+ * Life would be boring if it wasn't for a few issues that we have to deal
+ * with here:
+ *
+ *   - We can only cope with one IO space range and up to 3 Memory space
+ *     ranges. However, some machines (thanks Apple !) tend to split their
+ *     space into lots of small contiguous ranges. So we have to coalesce.
+ *
+ *   - Some busses have IO space not starting at 0, which causes trouble with
+ *     the way we do our IO resource renumbering. The code somewhat deals with
+ *     it for 64 bits but I would expect problems on 32 bits.
+ *
+ *   - Some 32 bits platforms such as 4xx can have physical space larger than
+ *     32 bits so we need to use 64 bits values for the parsing
+ */
+void pci_process_bridge_OF_ranges(struct pci_controller *hose,
+				  struct device_node *dev, int primary)
+{
+	int memno = 0;
+	struct resource *res;
+	struct of_pci_range range;
+	struct of_pci_range_parser parser;
+
+	printk(KERN_INFO "PCI host bridge %pOF %s ranges:\n",
+	       dev, primary ? "(primary)" : "");
+
+	/* Check for ranges property */
+	if (of_pci_range_parser_init(&parser, dev))
+		return;
+
+	/* Parse it */
+	for_each_of_pci_range(&parser, &range) {
+		/* If we failed translation or got a zero-sized region
+		 * (some FW try to feed us with non sensical zero sized regions
+		 * such as power3 which look like some kind of attempt at exposing
+		 * the VGA memory hole)
+		 */
+		if (range.cpu_addr == OF_BAD_ADDR || range.size == 0)
+			continue;
+
+		/* Act based on address space type */
+		res = NULL;
+		switch (range.flags & IORESOURCE_TYPE_BITS) {
+		case IORESOURCE_IO:
+			printk(KERN_INFO
+			       "  IO 0x%016llx..0x%016llx -> 0x%016llx\n",
+			       range.cpu_addr, range.cpu_addr + range.size - 1,
+			       range.pci_addr);
+
+			/* We support only one IO range */
+			if (hose->pci_io_size) {
+				printk(KERN_INFO
+				       " \\--> Skipped (too many) !\n");
+				continue;
+			}
+#ifdef CONFIG_PPC32
+			/* On 32 bits, limit I/O space to 16MB */
+			if (range.size > 0x01000000)
+				range.size = 0x01000000;
+
+			/* 32 bits needs to map IOs here */
+			hose->io_base_virt = ioremap(range.cpu_addr,
+						range.size);
+
+			/* Expect trouble if pci_addr is not 0 */
+			if (primary)
+				isa_io_base =
+					(unsigned long)hose->io_base_virt;
+#endif /* CONFIG_PPC32 */
+			/* pci_io_size and io_base_phys always represent IO
+			 * space starting at 0 so we factor in pci_addr
+			 */
+			hose->pci_io_size = range.pci_addr + range.size;
+			hose->io_base_phys = range.cpu_addr - range.pci_addr;
+
+			/* Build resource */
+			res = &hose->io_resource;
+			range.cpu_addr = range.pci_addr;
+			break;
+		case IORESOURCE_MEM:
+			printk(KERN_INFO
+			       " MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n",
+			       range.cpu_addr, range.cpu_addr + range.size - 1,
+			       range.pci_addr,
+			       (range.pci_space & 0x40000000) ?
+			       "Prefetch" : "");
+
+			/* We support only 3 memory ranges */
+			if (memno >= 3) {
+				printk(KERN_INFO
+				       " \\--> Skipped (too many) !\n");
+				continue;
+			}
+			/* Handles ISA memory hole space here */
+			if (range.pci_addr == 0) {
+				if (primary || isa_mem_base == 0)
+					isa_mem_base = range.cpu_addr;
+				hose->isa_mem_phys = range.cpu_addr;
+				hose->isa_mem_size = range.size;
+			}
+
+			/* Build resource */
+			hose->mem_offset[memno] = range.cpu_addr -
+							range.pci_addr;
+			res = &hose->mem_resources[memno++];
+			break;
+		}
+		if (res != NULL) {
+			res->name = dev->full_name;
+			res->flags = range.flags;
+			res->start = range.cpu_addr;
+			res->end = range.cpu_addr + range.size - 1;
+			res->parent = res->child = res->sibling = NULL;
+		}
+	}
+}
+
+/* Decide whether to display the domain number in /proc */
+int pci_proc_domain(struct pci_bus *bus)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+
+	if (!pci_has_flag(PCI_ENABLE_PROC_DOMAINS))
+		return 0;
+	if (pci_has_flag(PCI_COMPAT_DOMAIN_0))
+		return hose->global_number != 0;
+	return 1;
+}
+
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+	if (ppc_md.pcibios_root_bridge_prepare)
+		return ppc_md.pcibios_root_bridge_prepare(bridge);
+
+	return 0;
+}
+
+/* This header fixup will do the resource fixup for all devices as they are
+ * probed, but not for bridge ranges
+ */
+static void pcibios_fixup_resources(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	int i;
+
+	if (!hose) {
+		printk(KERN_ERR "No host bridge for PCI dev %s !\n",
+		       pci_name(dev));
+		return;
+	}
+
+	if (dev->is_virtfn)
+		return;
+
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		struct resource *res = dev->resource + i;
+		struct pci_bus_region reg;
+		if (!res->flags)
+			continue;
+
+		/* If we're going to re-assign everything, we mark all resources
+		 * as unset (and 0-base them). In addition, we mark BARs starting
+		 * at 0 as unset as well, except if PCI_PROBE_ONLY is also set
+		 * since in that case, we don't want to re-assign anything
+		 */
+		pcibios_resource_to_bus(dev->bus, &reg, res);
+		if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) ||
+		    (reg.start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) {
+			/* Only print message if not re-assigning */
+			if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC))
+				pr_debug("PCI:%s Resource %d %pR is unassigned\n",
+					 pci_name(dev), i, res);
+			res->end -= res->start;
+			res->start = 0;
+			res->flags |= IORESOURCE_UNSET;
+			continue;
+		}
+
+		pr_debug("PCI:%s Resource %d %pR\n", pci_name(dev), i, res);
+	}
+
+	/* Call machine specific resource fixup */
+	if (ppc_md.pcibios_fixup_resources)
+		ppc_md.pcibios_fixup_resources(dev);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources);
+
+/* This function tries to figure out if a bridge resource has been initialized
+ * by the firmware or not. It doesn't have to be absolutely bullet proof, but
+ * things go more smoothly when it gets it right. It should covers cases such
+ * as Apple "closed" bridge resources and bare-metal pSeries unassigned bridges
+ */
+static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
+						 struct resource *res)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pci_dev *dev = bus->self;
+	resource_size_t offset;
+	struct pci_bus_region region;
+	u16 command;
+	int i;
+
+	/* We don't do anything if PCI_PROBE_ONLY is set */
+	if (pci_has_flag(PCI_PROBE_ONLY))
+		return 0;
+
+	/* Job is a bit different between memory and IO */
+	if (res->flags & IORESOURCE_MEM) {
+		pcibios_resource_to_bus(dev->bus, &region, res);
+
+		/* If the BAR is non-0 then it's probably been initialized */
+		if (region.start != 0)
+			return 0;
+
+		/* The BAR is 0, let's check if memory decoding is enabled on
+		 * the bridge. If not, we consider it unassigned
+		 */
+		pci_read_config_word(dev, PCI_COMMAND, &command);
+		if ((command & PCI_COMMAND_MEMORY) == 0)
+			return 1;
+
+		/* Memory decoding is enabled and the BAR is 0. If any of the bridge
+		 * resources covers that starting address (0 then it's good enough for
+		 * us for memory space)
+		 */
+		for (i = 0; i < 3; i++) {
+			if ((hose->mem_resources[i].flags & IORESOURCE_MEM) &&
+			    hose->mem_resources[i].start == hose->mem_offset[i])
+				return 0;
+		}
+
+		/* Well, it starts at 0 and we know it will collide so we may as
+		 * well consider it as unassigned. That covers the Apple case.
+		 */
+		return 1;
+	} else {
+		/* If the BAR is non-0, then we consider it assigned */
+		offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+		if (((res->start - offset) & 0xfffffffful) != 0)
+			return 0;
+
+		/* Here, we are a bit different than memory as typically IO space
+		 * starting at low addresses -is- valid. What we do instead if that
+		 * we consider as unassigned anything that doesn't have IO enabled
+		 * in the PCI command register, and that's it.
+		 */
+		pci_read_config_word(dev, PCI_COMMAND, &command);
+		if (command & PCI_COMMAND_IO)
+			return 0;
+
+		/* It's starting at 0 and IO is disabled in the bridge, consider
+		 * it unassigned
+		 */
+		return 1;
+	}
+}
+
+/* Fixup resources of a PCI<->PCI bridge */
+static void pcibios_fixup_bridge(struct pci_bus *bus)
+{
+	struct resource *res;
+	int i;
+
+	struct pci_dev *dev = bus->self;
+
+	pci_bus_for_each_resource(bus, res, i) {
+		if (!res || !res->flags)
+			continue;
+		if (i >= 3 && bus->self->transparent)
+			continue;
+
+		/* If we're going to reassign everything, we can
+		 * shrink the P2P resource to have size as being
+		 * of 0 in order to save space.
+		 */
+		if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) {
+			res->flags |= IORESOURCE_UNSET;
+			res->start = 0;
+			res->end = -1;
+			continue;
+		}
+
+		pr_debug("PCI:%s Bus rsrc %d %pR\n", pci_name(dev), i, res);
+
+		/* Try to detect uninitialized P2P bridge resources,
+		 * and clear them out so they get re-assigned later
+		 */
+		if (pcibios_uninitialized_bridge_resource(bus, res)) {
+			res->flags = 0;
+			pr_debug("PCI:%s            (unassigned)\n", pci_name(dev));
+		}
+	}
+}
+
+void pcibios_setup_bus_self(struct pci_bus *bus)
+{
+	struct pci_controller *phb;
+
+	/* Fix up the bus resources for P2P bridges */
+	if (bus->self != NULL)
+		pcibios_fixup_bridge(bus);
+
+	/* Platform specific bus fixups. This is currently only used
+	 * by fsl_pci and I'm hoping to get rid of it at some point
+	 */
+	if (ppc_md.pcibios_fixup_bus)
+		ppc_md.pcibios_fixup_bus(bus);
+
+	/* Setup bus DMA mappings */
+	phb = pci_bus_to_host(bus);
+	if (phb->controller_ops.dma_bus_setup)
+		phb->controller_ops.dma_bus_setup(bus);
+}
+
+static void pcibios_setup_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb;
+	/* Fixup NUMA node as it may not be setup yet by the generic
+	 * code and is needed by the DMA init
+	 */
+	set_dev_node(&dev->dev, pcibus_to_node(dev->bus));
+
+	/* Hook up default DMA ops */
+	set_dma_ops(&dev->dev, pci_dma_ops);
+	set_dma_offset(&dev->dev, PCI_DRAM_OFFSET);
+
+	/* Additional platform DMA/iommu setup */
+	phb = pci_bus_to_host(dev->bus);
+	if (phb->controller_ops.dma_dev_setup)
+		phb->controller_ops.dma_dev_setup(dev);
+
+	/* Read default IRQs and fixup if necessary */
+	pci_read_irq_line(dev);
+	if (ppc_md.pci_irq_fixup)
+		ppc_md.pci_irq_fixup(dev);
+}
+
+int pcibios_add_device(struct pci_dev *dev)
+{
+	/*
+	 * We can only call pcibios_setup_device() after bus setup is complete,
+	 * since some of the platform specific DMA setup code depends on it.
+	 */
+	if (dev->bus->is_added)
+		pcibios_setup_device(dev);
+
+#ifdef CONFIG_PCI_IOV
+	if (ppc_md.pcibios_fixup_sriov)
+		ppc_md.pcibios_fixup_sriov(dev);
+#endif /* CONFIG_PCI_IOV */
+
+	return 0;
+}
+
+void pcibios_setup_bus_devices(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	pr_debug("PCI: Fixup bus devices %d (%s)\n",
+		 bus->number, bus->self ? pci_name(bus->self) : "PHB");
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		/* Cardbus can call us to add new devices to a bus, so ignore
+		 * those who are already fully discovered
+		 */
+		if (pci_dev_is_added(dev))
+			continue;
+
+		pcibios_setup_device(dev);
+	}
+}
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	/* No special bus mastering setup handling */
+}
+
+void pcibios_fixup_bus(struct pci_bus *bus)
+{
+	/* When called from the generic PCI probe, read PCI<->PCI bridge
+	 * bases. This is -not- called when generating the PCI tree from
+	 * the OF device-tree.
+	 */
+	pci_read_bridge_bases(bus);
+
+	/* Now fixup the bus bus */
+	pcibios_setup_bus_self(bus);
+
+	/* Now fixup devices on that bus */
+	pcibios_setup_bus_devices(bus);
+}
+EXPORT_SYMBOL(pcibios_fixup_bus);
+
+void pci_fixup_cardbus(struct pci_bus *bus)
+{
+	/* Now fixup devices on that bus */
+	pcibios_setup_bus_devices(bus);
+}
+
+
+static int skip_isa_ioresource_align(struct pci_dev *dev)
+{
+	if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) &&
+	    !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA))
+		return 1;
+	return 0;
+}
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ *
+ * Why? Because some silly external IO cards only decode
+ * the low 10 bits of the IO address. The 0x00-0xff region
+ * is reserved for motherboard devices that decode all 16
+ * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
+ * but we want to try to avoid allocating at 0x2900-0x2bff
+ * which might have be mirrored at 0x0100-0x03ff..
+ */
+resource_size_t pcibios_align_resource(void *data, const struct resource *res,
+				resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (skip_isa_ioresource_align(dev))
+			return start;
+		if (start & 0x300)
+			start = (start + 0x3ff) & ~0x3ff;
+	}
+
+	return start;
+}
+EXPORT_SYMBOL(pcibios_align_resource);
+
+/*
+ * Reparent resource children of pr that conflict with res
+ * under res, and make res replace those children.
+ */
+static int reparent_resources(struct resource *parent,
+				     struct resource *res)
+{
+	struct resource *p, **pp;
+	struct resource **firstpp = NULL;
+
+	for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
+		if (p->end < res->start)
+			continue;
+		if (res->end < p->start)
+			break;
+		if (p->start < res->start || p->end > res->end)
+			return -1;	/* not completely contained */
+		if (firstpp == NULL)
+			firstpp = pp;
+	}
+	if (firstpp == NULL)
+		return -1;	/* didn't find any conflicting entries? */
+	res->parent = parent;
+	res->child = *firstpp;
+	res->sibling = *pp;
+	*firstpp = res;
+	*pp = NULL;
+	for (p = res->child; p != NULL; p = p->sibling) {
+		p->parent = res;
+		pr_debug("PCI: Reparented %s %pR under %s\n",
+			 p->name, p, res->name);
+	}
+	return 0;
+}
+
+/*
+ *  Handle resources of PCI devices.  If the world were perfect, we could
+ *  just allocate all the resource regions and do nothing more.  It isn't.
+ *  On the other hand, we cannot just re-allocate all devices, as it would
+ *  require us to know lots of host bridge internals.  So we attempt to
+ *  keep as much of the original configuration as possible, but tweak it
+ *  when it's found to be wrong.
+ *
+ *  Known BIOS problems we have to work around:
+ *	- I/O or memory regions not configured
+ *	- regions configured, but not enabled in the command register
+ *	- bogus I/O addresses above 64K used
+ *	- expansion ROMs left enabled (this may sound harmless, but given
+ *	  the fact the PCI specs explicitly allow address decoders to be
+ *	  shared between expansion ROMs and other resource regions, it's
+ *	  at least dangerous)
+ *
+ *  Our solution:
+ *	(1) Allocate resources for all buses behind PCI-to-PCI bridges.
+ *	    This gives us fixed barriers on where we can allocate.
+ *	(2) Allocate resources for all enabled devices.  If there is
+ *	    a collision, just mark the resource as unallocated. Also
+ *	    disable expansion ROMs during this step.
+ *	(3) Try to allocate resources for disabled devices.  If the
+ *	    resources were assigned correctly, everything goes well,
+ *	    if they weren't, they won't disturb allocation of other
+ *	    resources.
+ *	(4) Assign new addresses to resources which were either
+ *	    not configured at all or misconfigured.  If explicitly
+ *	    requested by the user, configure expansion ROM address
+ *	    as well.
+ */
+
+static void pcibios_allocate_bus_resources(struct pci_bus *bus)
+{
+	struct pci_bus *b;
+	int i;
+	struct resource *res, *pr;
+
+	pr_debug("PCI: Allocating bus resources for %04x:%02x...\n",
+		 pci_domain_nr(bus), bus->number);
+
+	pci_bus_for_each_resource(bus, res, i) {
+		if (!res || !res->flags || res->start > res->end || res->parent)
+			continue;
+
+		/* If the resource was left unset at this point, we clear it */
+		if (res->flags & IORESOURCE_UNSET)
+			goto clear_resource;
+
+		if (bus->parent == NULL)
+			pr = (res->flags & IORESOURCE_IO) ?
+				&ioport_resource : &iomem_resource;
+		else {
+			pr = pci_find_parent_resource(bus->self, res);
+			if (pr == res) {
+				/* this happens when the generic PCI
+				 * code (wrongly) decides that this
+				 * bridge is transparent  -- paulus
+				 */
+				continue;
+			}
+		}
+
+		pr_debug("PCI: %s (bus %d) bridge rsrc %d: %pR, parent %p (%s)\n",
+			 bus->self ? pci_name(bus->self) : "PHB", bus->number,
+			 i, res, pr, (pr && pr->name) ? pr->name : "nil");
+
+		if (pr && !(pr->flags & IORESOURCE_UNSET)) {
+			struct pci_dev *dev = bus->self;
+
+			if (request_resource(pr, res) == 0)
+				continue;
+			/*
+			 * Must be a conflict with an existing entry.
+			 * Move that entry (or entries) under the
+			 * bridge resource and try again.
+			 */
+			if (reparent_resources(pr, res) == 0)
+				continue;
+
+			if (dev && i < PCI_BRIDGE_RESOURCE_NUM &&
+			    pci_claim_bridge_resource(dev,
+						i + PCI_BRIDGE_RESOURCES) == 0)
+				continue;
+		}
+		pr_warn("PCI: Cannot allocate resource region %d of PCI bridge %d, will remap\n",
+			i, bus->number);
+	clear_resource:
+		/* The resource might be figured out when doing
+		 * reassignment based on the resources required
+		 * by the downstream PCI devices. Here we set
+		 * the size of the resource to be 0 in order to
+		 * save more space.
+		 */
+		res->start = 0;
+		res->end = -1;
+		res->flags = 0;
+	}
+
+	list_for_each_entry(b, &bus->children, node)
+		pcibios_allocate_bus_resources(b);
+}
+
+static inline void alloc_resource(struct pci_dev *dev, int idx)
+{
+	struct resource *pr, *r = &dev->resource[idx];
+
+	pr_debug("PCI: Allocating %s: Resource %d: %pR\n",
+		 pci_name(dev), idx, r);
+
+	pr = pci_find_parent_resource(dev, r);
+	if (!pr || (pr->flags & IORESOURCE_UNSET) ||
+	    request_resource(pr, r) < 0) {
+		printk(KERN_WARNING "PCI: Cannot allocate resource region %d"
+		       " of device %s, will remap\n", idx, pci_name(dev));
+		if (pr)
+			pr_debug("PCI:  parent is %p: %pR\n", pr, pr);
+		/* We'll assign a new address later */
+		r->flags |= IORESOURCE_UNSET;
+		r->end -= r->start;
+		r->start = 0;
+	}
+}
+
+static void __init pcibios_allocate_resources(int pass)
+{
+	struct pci_dev *dev = NULL;
+	int idx, disabled;
+	u16 command;
+	struct resource *r;
+
+	for_each_pci_dev(dev) {
+		pci_read_config_word(dev, PCI_COMMAND, &command);
+		for (idx = 0; idx <= PCI_ROM_RESOURCE; idx++) {
+			r = &dev->resource[idx];
+			if (r->parent)		/* Already allocated */
+				continue;
+			if (!r->flags || (r->flags & IORESOURCE_UNSET))
+				continue;	/* Not assigned at all */
+			/* We only allocate ROMs on pass 1 just in case they
+			 * have been screwed up by firmware
+			 */
+			if (idx == PCI_ROM_RESOURCE )
+				disabled = 1;
+			if (r->flags & IORESOURCE_IO)
+				disabled = !(command & PCI_COMMAND_IO);
+			else
+				disabled = !(command & PCI_COMMAND_MEMORY);
+			if (pass == disabled)
+				alloc_resource(dev, idx);
+		}
+		if (pass)
+			continue;
+		r = &dev->resource[PCI_ROM_RESOURCE];
+		if (r->flags) {
+			/* Turn the ROM off, leave the resource region,
+			 * but keep it unregistered.
+			 */
+			u32 reg;
+			pci_read_config_dword(dev, dev->rom_base_reg, &reg);
+			if (reg & PCI_ROM_ADDRESS_ENABLE) {
+				pr_debug("PCI: Switching off ROM of %s\n",
+					 pci_name(dev));
+				r->flags &= ~IORESOURCE_ROM_ENABLE;
+				pci_write_config_dword(dev, dev->rom_base_reg,
+						       reg & ~PCI_ROM_ADDRESS_ENABLE);
+			}
+		}
+	}
+}
+
+static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	resource_size_t	offset;
+	struct resource *res, *pres;
+	int i;
+
+	pr_debug("Reserving legacy ranges for domain %04x\n", pci_domain_nr(bus));
+
+	/* Check for IO */
+	if (!(hose->io_resource.flags & IORESOURCE_IO))
+		goto no_io;
+	offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+	BUG_ON(res == NULL);
+	res->name = "Legacy IO";
+	res->flags = IORESOURCE_IO;
+	res->start = offset;
+	res->end = (offset + 0xfff) & 0xfffffffful;
+	pr_debug("Candidate legacy IO: %pR\n", res);
+	if (request_resource(&hose->io_resource, res)) {
+		printk(KERN_DEBUG
+		       "PCI %04x:%02x Cannot reserve Legacy IO %pR\n",
+		       pci_domain_nr(bus), bus->number, res);
+		kfree(res);
+	}
+
+ no_io:
+	/* Check for memory */
+	for (i = 0; i < 3; i++) {
+		pres = &hose->mem_resources[i];
+		offset = hose->mem_offset[i];
+		if (!(pres->flags & IORESOURCE_MEM))
+			continue;
+		pr_debug("hose mem res: %pR\n", pres);
+		if ((pres->start - offset) <= 0xa0000 &&
+		    (pres->end - offset) >= 0xbffff)
+			break;
+	}
+	if (i >= 3)
+		return;
+	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+	BUG_ON(res == NULL);
+	res->name = "Legacy VGA memory";
+	res->flags = IORESOURCE_MEM;
+	res->start = 0xa0000 + offset;
+	res->end = 0xbffff + offset;
+	pr_debug("Candidate VGA memory: %pR\n", res);
+	if (request_resource(pres, res)) {
+		printk(KERN_DEBUG
+		       "PCI %04x:%02x Cannot reserve VGA memory %pR\n",
+		       pci_domain_nr(bus), bus->number, res);
+		kfree(res);
+	}
+}
+
+void __init pcibios_resource_survey(void)
+{
+	struct pci_bus *b;
+
+	/* Allocate and assign resources */
+	list_for_each_entry(b, &pci_root_buses, node)
+		pcibios_allocate_bus_resources(b);
+	if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) {
+		pcibios_allocate_resources(0);
+		pcibios_allocate_resources(1);
+	}
+
+	/* Before we start assigning unassigned resource, we try to reserve
+	 * the low IO area and the VGA memory area if they intersect the
+	 * bus available resources to avoid allocating things on top of them
+	 */
+	if (!pci_has_flag(PCI_PROBE_ONLY)) {
+		list_for_each_entry(b, &pci_root_buses, node)
+			pcibios_reserve_legacy_regions(b);
+	}
+
+	/* Now, if the platform didn't decide to blindly trust the firmware,
+	 * we proceed to assigning things that were left unassigned
+	 */
+	if (!pci_has_flag(PCI_PROBE_ONLY)) {
+		pr_debug("PCI: Assigning unassigned resources...\n");
+		pci_assign_unassigned_resources();
+	}
+
+	/* Call machine dependent fixup */
+	if (ppc_md.pcibios_fixup)
+		ppc_md.pcibios_fixup();
+}
+
+/* This is used by the PCI hotplug driver to allocate resource
+ * of newly plugged busses. We can try to consolidate with the
+ * rest of the code later, for now, keep it as-is as our main
+ * resource allocation function doesn't deal with sub-trees yet.
+ */
+void pcibios_claim_one_bus(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+	struct pci_bus *child_bus;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		int i;
+
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *r = &dev->resource[i];
+
+			if (r->parent || !r->start || !r->flags)
+				continue;
+
+			pr_debug("PCI: Claiming %s: Resource %d: %pR\n",
+				 pci_name(dev), i, r);
+
+			if (pci_claim_resource(dev, i) == 0)
+				continue;
+
+			pci_claim_bridge_resource(dev, i);
+		}
+	}
+
+	list_for_each_entry(child_bus, &bus->children, node)
+		pcibios_claim_one_bus(child_bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_claim_one_bus);
+
+
+/* pcibios_finish_adding_to_bus
+ *
+ * This is to be called by the hotplug code after devices have been
+ * added to a bus, this include calling it for a PHB that is just
+ * being added
+ */
+void pcibios_finish_adding_to_bus(struct pci_bus *bus)
+{
+	pr_debug("PCI: Finishing adding to hotplug bus %04x:%02x\n",
+		 pci_domain_nr(bus), bus->number);
+
+	/* Allocate bus and devices resources */
+	pcibios_allocate_bus_resources(bus);
+	pcibios_claim_one_bus(bus);
+	if (!pci_has_flag(PCI_PROBE_ONLY)) {
+		if (bus->self)
+			pci_assign_unassigned_bridge_resources(bus->self);
+		else
+			pci_assign_unassigned_bus_resources(bus);
+	}
+
+	/* Fixup EEH */
+	eeh_add_device_tree_late(bus);
+
+	/* Add new devices to global lists.  Register in proc, sysfs. */
+	pci_bus_add_devices(bus);
+
+	/* sysfs files should only be added after devices are added */
+	eeh_add_sysfs_files(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus);
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.enable_device_hook)
+		if (!phb->controller_ops.enable_device_hook(dev))
+			return -EINVAL;
+
+	return pci_enable_resources(dev, mask);
+}
+
+void pcibios_disable_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.disable_device)
+		phb->controller_ops.disable_device(dev);
+}
+
+resource_size_t pcibios_io_space_offset(struct pci_controller *hose)
+{
+	return (unsigned long) hose->io_base_virt - _IO_BASE;
+}
+
+static void pcibios_setup_phb_resources(struct pci_controller *hose,
+					struct list_head *resources)
+{
+	struct resource *res;
+	resource_size_t offset;
+	int i;
+
+	/* Hookup PHB IO resource */
+	res = &hose->io_resource;
+
+	if (!res->flags) {
+		pr_debug("PCI: I/O resource not set for host"
+			 " bridge %pOF (domain %d)\n",
+			 hose->dn, hose->global_number);
+	} else {
+		offset = pcibios_io_space_offset(hose);
+
+		pr_debug("PCI: PHB IO resource    = %pR off 0x%08llx\n",
+			 res, (unsigned long long)offset);
+		pci_add_resource_offset(resources, res, offset);
+	}
+
+	/* Hookup PHB Memory resources */
+	for (i = 0; i < 3; ++i) {
+		res = &hose->mem_resources[i];
+		if (!res->flags)
+			continue;
+
+		offset = hose->mem_offset[i];
+		pr_debug("PCI: PHB MEM resource %d = %pR off 0x%08llx\n", i,
+			 res, (unsigned long long)offset);
+
+		pci_add_resource_offset(resources, res, offset);
+	}
+}
+
+/*
+ * Null PCI config access functions, for the case when we can't
+ * find a hose.
+ */
+#define NULL_PCI_OP(rw, size, type)					\
+static int								\
+null_##rw##_config_##size(struct pci_dev *dev, int offset, type val)	\
+{									\
+	return PCIBIOS_DEVICE_NOT_FOUND;    				\
+}
+
+static int
+null_read_config(struct pci_bus *bus, unsigned int devfn, int offset,
+		 int len, u32 *val)
+{
+	return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
+static int
+null_write_config(struct pci_bus *bus, unsigned int devfn, int offset,
+		  int len, u32 val)
+{
+	return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
+static struct pci_ops null_pci_ops =
+{
+	.read = null_read_config,
+	.write = null_write_config,
+};
+
+/*
+ * These functions are used early on before PCI scanning is done
+ * and all of the pci_dev and pci_bus structures have been created.
+ */
+static struct pci_bus *
+fake_pci_bus(struct pci_controller *hose, int busnr)
+{
+	static struct pci_bus bus;
+
+	if (hose == NULL) {
+		printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr);
+	}
+	bus.number = busnr;
+	bus.sysdata = hose;
+	bus.ops = hose? hose->ops: &null_pci_ops;
+	return &bus;
+}
+
+#define EARLY_PCI_OP(rw, size, type)					\
+int early_##rw##_config_##size(struct pci_controller *hose, int bus,	\
+			       int devfn, int offset, type value)	\
+{									\
+	return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus),	\
+					    devfn, offset, value);	\
+}
+
+EARLY_PCI_OP(read, byte, u8 *)
+EARLY_PCI_OP(read, word, u16 *)
+EARLY_PCI_OP(read, dword, u32 *)
+EARLY_PCI_OP(write, byte, u8)
+EARLY_PCI_OP(write, word, u16)
+EARLY_PCI_OP(write, dword, u32)
+
+int early_find_capability(struct pci_controller *hose, int bus, int devfn,
+			  int cap)
+{
+	return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap);
+}
+
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+
+	return of_node_get(hose->dn);
+}
+
+/**
+ * pci_scan_phb - Given a pci_controller, setup and scan the PCI bus
+ * @hose: Pointer to the PCI host controller instance structure
+ */
+void pcibios_scan_phb(struct pci_controller *hose)
+{
+	LIST_HEAD(resources);
+	struct pci_bus *bus;
+	struct device_node *node = hose->dn;
+	int mode;
+
+	pr_debug("PCI: Scanning PHB %pOF\n", node);
+
+	/* Get some IO space for the new PHB */
+	pcibios_setup_phb_io_space(hose);
+
+	/* Wire up PHB bus resources */
+	pcibios_setup_phb_resources(hose, &resources);
+
+	hose->busn.start = hose->first_busno;
+	hose->busn.end	 = hose->last_busno;
+	hose->busn.flags = IORESOURCE_BUS;
+	pci_add_resource(&resources, &hose->busn);
+
+	/* Create an empty bus for the toplevel */
+	bus = pci_create_root_bus(hose->parent, hose->first_busno,
+				  hose->ops, hose, &resources);
+	if (bus == NULL) {
+		pr_err("Failed to create bus for PCI domain %04x\n",
+			hose->global_number);
+		pci_free_resource_list(&resources);
+		return;
+	}
+	hose->bus = bus;
+
+	/* Get probe mode and perform scan */
+	mode = PCI_PROBE_NORMAL;
+	if (node && hose->controller_ops.probe_mode)
+		mode = hose->controller_ops.probe_mode(bus);
+	pr_debug("    probe mode: %d\n", mode);
+	if (mode == PCI_PROBE_DEVTREE)
+		of_scan_bus(node, bus);
+
+	if (mode == PCI_PROBE_NORMAL) {
+		pci_bus_update_busn_res_end(bus, 255);
+		hose->last_busno = pci_scan_child_bus(bus);
+		pci_bus_update_busn_res_end(bus, hose->last_busno);
+	}
+
+	/* Platform gets a chance to do some global fixups before
+	 * we proceed to resource allocation
+	 */
+	if (ppc_md.pcibios_fixup_phb)
+		ppc_md.pcibios_fixup_phb(hose);
+
+	/* Configure PCI Express settings */
+	if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
+		struct pci_bus *child;
+		list_for_each_entry(child, &bus->children, node)
+			pcie_bus_configure_settings(child);
+	}
+}
+EXPORT_SYMBOL_GPL(pcibios_scan_phb);
+
+static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
+{
+	int i, class = dev->class >> 8;
+	/* When configured as agent, programing interface = 1 */
+	int prog_if = dev->class & 0xf;
+
+	if ((class == PCI_CLASS_PROCESSOR_POWERPC ||
+	     class == PCI_CLASS_BRIDGE_OTHER) &&
+		(dev->hdr_type == PCI_HEADER_TYPE_NORMAL) &&
+		(prog_if == 0) &&
+		(dev->bus->parent == NULL)) {
+		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+			dev->resource[i].start = 0;
+			dev->resource[i].end = 0;
+			dev->resource[i].flags = 0;
+		}
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl);
+
+
+static int __init discover_phbs(void)
+{
+	if (ppc_md.discover_phbs)
+		ppc_md.discover_phbs();
+
+	return 0;
+}
+core_initcall(discover_phbs);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644
index 000000000..cf47b1aec
--- /dev/null
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -0,0 +1,150 @@
+/*
+ * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
+ *
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+static struct pci_bus *find_bus_among_children(struct pci_bus *bus,
+					       struct device_node *dn)
+{
+	struct pci_bus *child = NULL;
+	struct pci_bus *tmp;
+
+	if (pci_bus_to_OF_node(bus) == dn)
+		return bus;
+
+	list_for_each_entry(tmp, &bus->children, node) {
+		child = find_bus_among_children(tmp, dn);
+		if (child)
+			break;
+	}
+
+	return child;
+}
+
+struct pci_bus *pci_find_bus_by_node(struct device_node *dn)
+{
+	struct pci_dn *pdn = PCI_DN(dn);
+
+	if (!pdn  || !pdn->phb || !pdn->phb->bus)
+		return NULL;
+
+	return find_bus_among_children(pdn->phb->bus, dn);
+}
+EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
+
+/**
+ * pcibios_release_device - release PCI device
+ * @dev: PCI device
+ *
+ * The function is called before releasing the indicated PCI device.
+ */
+void pcibios_release_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	eeh_remove_device(dev);
+
+	if (phb->controller_ops.release_device)
+		phb->controller_ops.release_device(dev);
+}
+
+/**
+ * pci_hp_remove_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pci_hp_remove_devices(struct pci_bus *bus)
+{
+	struct pci_dev *dev, *tmp;
+	struct pci_bus *child_bus;
+
+	/* First go down child busses */
+	list_for_each_entry(child_bus, &bus->children, node)
+		pci_hp_remove_devices(child_bus);
+
+	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+		 pci_domain_nr(bus),  bus->number);
+	list_for_each_entry_safe_reverse(dev, tmp, &bus->devices, bus_list) {
+		pr_debug("   Removing %s...\n", pci_name(dev));
+		pci_stop_and_remove_bus_device(dev);
+	}
+}
+EXPORT_SYMBOL_GPL(pci_hp_remove_devices);
+
+/**
+ * pci_hp_add_devices - adds new pci devices to bus
+ * @bus: the indicated PCI bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices.  (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pci_hp_add_devices(struct pci_bus *bus)
+{
+	int slotno, mode, max;
+	struct pci_dev *dev;
+	struct pci_controller *phb;
+	struct device_node *dn = pci_bus_to_OF_node(bus);
+
+	eeh_add_device_tree_early(PCI_DN(dn));
+
+	phb = pci_bus_to_host(bus);
+
+	mode = PCI_PROBE_NORMAL;
+	if (phb->controller_ops.probe_mode)
+		mode = phb->controller_ops.probe_mode(bus);
+
+	if (mode == PCI_PROBE_DEVTREE) {
+		/* use ofdt-based probe */
+		of_rescan_bus(dn, bus);
+	} else if (mode == PCI_PROBE_NORMAL &&
+		   dn->child && PCI_DN(dn->child)) {
+		/*
+		 * Use legacy probe. In the partial hotplug case, we
+		 * probably have grandchildren devices unplugged. So
+		 * we don't check the return value from pci_scan_slot() in
+		 * order for fully rescan all the way down to pick them up.
+		 * They can have been removed during partial hotplug.
+		 */
+		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+		pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+		pcibios_setup_bus_devices(bus);
+		max = bus->busn_res.start;
+		/*
+		 * Scan bridges that are already configured. We don't touch
+		 * them unless they are misconfigured (which will be done in
+		 * the second scan below).
+		 */
+		for_each_pci_bridge(dev, bus)
+			max = pci_scan_bridge(bus, dev, max, 0);
+
+		/* Scan bridges that need to be reconfigured */
+		for_each_pci_bridge(dev, bus)
+			max = pci_scan_bridge(bus, dev, max, 1);
+	}
+	pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pci_hp_add_devices);
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
new file mode 100644
index 000000000..d63b488d3
--- /dev/null
+++ b/arch/powerpc/kernel/pci_32.c
@@ -0,0 +1,312 @@
+/*
+ * Common pmac/prep/chrp pci routines. -- Cort
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/irq.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/sections.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/byteorder.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+
+#undef DEBUG
+
+unsigned long isa_io_base     = 0;
+unsigned long pci_dram_offset = 0;
+int pcibios_assign_bus_offset = 1;
+EXPORT_SYMBOL(isa_io_base);
+EXPORT_SYMBOL(pci_dram_offset);
+
+void pcibios_make_OF_bus_map(void);
+
+static void fixup_cpc710_pci64(struct pci_dev* dev);
+static u8* pci_to_OF_bus_map;
+
+/* By default, we don't re-assign bus numbers. We do this only on
+ * some pmacs
+ */
+static int pci_assign_all_buses;
+
+static int pci_bus_count;
+
+/* This will remain NULL for now, until isa-bridge.c is made common
+ * to both 32-bit and 64-bit.
+ */
+struct pci_dev *isa_bridge_pcidev;
+EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
+
+static void
+fixup_cpc710_pci64(struct pci_dev* dev)
+{
+	/* Hide the PCI64 BARs from the kernel as their content doesn't
+	 * fit well in the resource management
+	 */
+	dev->resource[0].start = dev->resource[0].end = 0;
+	dev->resource[0].flags = 0;
+	dev->resource[1].start = dev->resource[1].end = 0;
+	dev->resource[1].flags = 0;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM,	PCI_DEVICE_ID_IBM_CPC710_PCI64,	fixup_cpc710_pci64);
+
+/*
+ * Functions below are used on OpenFirmware machines.
+ */
+static void
+make_one_node_map(struct device_node* node, u8 pci_bus)
+{
+	const int *bus_range;
+	int len;
+
+	if (pci_bus >= pci_bus_count)
+		return;
+	bus_range = of_get_property(node, "bus-range", &len);
+	if (bus_range == NULL || len < 2 * sizeof(int)) {
+		printk(KERN_WARNING "Can't get bus-range for %pOF, "
+		       "assuming it starts at 0\n", node);
+		pci_to_OF_bus_map[pci_bus] = 0;
+	} else
+		pci_to_OF_bus_map[pci_bus] = bus_range[0];
+
+	for_each_child_of_node(node, node) {
+		struct pci_dev* dev;
+		const unsigned int *class_code, *reg;
+	
+		class_code = of_get_property(node, "class-code", NULL);
+		if (!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI &&
+			(*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS))
+			continue;
+		reg = of_get_property(node, "reg", NULL);
+		if (!reg)
+			continue;
+		dev = pci_get_domain_bus_and_slot(0, pci_bus,
+						  ((reg[0] >> 8) & 0xff));
+		if (!dev || !dev->subordinate) {
+			pci_dev_put(dev);
+			continue;
+		}
+		make_one_node_map(node, dev->subordinate->number);
+		pci_dev_put(dev);
+	}
+}
+	
+void
+pcibios_make_OF_bus_map(void)
+{
+	int i;
+	struct pci_controller *hose, *tmp;
+	struct property *map_prop;
+	struct device_node *dn;
+
+	pci_to_OF_bus_map = kmalloc(pci_bus_count, GFP_KERNEL);
+	if (!pci_to_OF_bus_map) {
+		printk(KERN_ERR "Can't allocate OF bus map !\n");
+		return;
+	}
+
+	/* We fill the bus map with invalid values, that helps
+	 * debugging.
+	 */
+	for (i=0; i<pci_bus_count; i++)
+		pci_to_OF_bus_map[i] = 0xff;
+
+	/* For each hose, we begin searching bridges */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		struct device_node* node = hose->dn;
+
+		if (!node)
+			continue;
+		make_one_node_map(node, hose->first_busno);
+	}
+	dn = of_find_node_by_path("/");
+	map_prop = of_find_property(dn, "pci-OF-bus-map", NULL);
+	if (map_prop) {
+		BUG_ON(pci_bus_count > map_prop->length);
+		memcpy(map_prop->value, pci_to_OF_bus_map, pci_bus_count);
+	}
+	of_node_put(dn);
+#ifdef DEBUG
+	printk("PCI->OF bus map:\n");
+	for (i=0; i<pci_bus_count; i++) {
+		if (pci_to_OF_bus_map[i] == 0xff)
+			continue;
+		printk("%d -> %d\n", i, pci_to_OF_bus_map[i]);
+	}
+#endif
+}
+
+
+/*
+ * Returns the PCI device matching a given OF node
+ */
+int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
+{
+	struct pci_dev *dev = NULL;
+	const __be32 *reg;
+	int size;
+
+	/* Check if it might have a chance to be a PCI device */
+	if (!pci_find_hose_for_OF_device(node))
+		return -ENODEV;
+
+	reg = of_get_property(node, "reg", &size);
+	if (!reg || size < 5 * sizeof(u32))
+		return -ENODEV;
+
+	*bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff;
+	*devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff;
+
+	/* Ok, here we need some tweak. If we have already renumbered
+	 * all busses, we can't rely on the OF bus number any more.
+	 * the pci_to_OF_bus_map is not enough as several PCI busses
+	 * may match the same OF bus number.
+	 */
+	if (!pci_to_OF_bus_map)
+		return 0;
+
+	for_each_pci_dev(dev)
+		if (pci_to_OF_bus_map[dev->bus->number] == *bus &&
+				dev->devfn == *devfn) {
+			*bus = dev->bus->number;
+			pci_dev_put(dev);
+			return 0;
+		}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(pci_device_from_OF_node);
+
+/* We create the "pci-OF-bus-map" property now so it appears in the
+ * /proc device tree
+ */
+void __init
+pci_create_OF_bus_map(void)
+{
+	struct property* of_prop;
+	struct device_node *dn;
+
+	of_prop = memblock_virt_alloc(sizeof(struct property) + 256, 0);
+	dn = of_find_node_by_path("/");
+	if (dn) {
+		memset(of_prop, -1, sizeof(struct property) + 256);
+		of_prop->name = "pci-OF-bus-map";
+		of_prop->length = 256;
+		of_prop->value = &of_prop[1];
+		of_add_property(dn, of_prop);
+		of_node_put(dn);
+	}
+}
+
+void pcibios_setup_phb_io_space(struct pci_controller *hose)
+{
+	unsigned long io_offset;
+	struct resource *res = &hose->io_resource;
+
+	/* Fixup IO space offset */
+	io_offset = pcibios_io_space_offset(hose);
+	res->start += io_offset;
+	res->end += io_offset;
+}
+
+static int __init pcibios_init(void)
+{
+	struct pci_controller *hose, *tmp;
+	int next_busno = 0;
+
+	printk(KERN_INFO "PCI: Probing PCI hardware\n");
+
+	if (pci_has_flag(PCI_REASSIGN_ALL_BUS))
+		pci_assign_all_buses = 1;
+
+	/* Scan all of the recorded PCI controllers.  */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		if (pci_assign_all_buses)
+			hose->first_busno = next_busno;
+		hose->last_busno = 0xff;
+		pcibios_scan_phb(hose);
+		pci_bus_add_devices(hose->bus);
+		if (pci_assign_all_buses || next_busno <= hose->last_busno)
+			next_busno = hose->last_busno + pcibios_assign_bus_offset;
+	}
+	pci_bus_count = next_busno;
+
+	/* OpenFirmware based machines need a map of OF bus
+	 * numbers vs. kernel bus numbers since we may have to
+	 * remap them.
+	 */
+	if (pci_assign_all_buses)
+		pcibios_make_OF_bus_map();
+
+	/* Call common code to handle resource allocation */
+	pcibios_resource_survey();
+
+	/* Call machine dependent post-init code */
+	if (ppc_md.pcibios_after_init)
+		ppc_md.pcibios_after_init();
+
+	return 0;
+}
+
+subsys_initcall(pcibios_init);
+
+static struct pci_controller*
+pci_bus_to_hose(int bus)
+{
+	struct pci_controller *hose, *tmp;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+		if (bus >= hose->first_busno && bus <= hose->last_busno)
+			return hose;
+	return NULL;
+}
+
+/* Provide information on locations of various I/O regions in physical
+ * memory.  Do this on a per-card basis so that we choose the right
+ * root bridge.
+ * Note that the returned IO or memory base is a physical address
+ */
+
+SYSCALL_DEFINE3(pciconfig_iobase, long, which,
+		unsigned long, bus, unsigned long, devfn)
+{
+	struct pci_controller* hose;
+	long result = -EOPNOTSUPP;
+
+	hose = pci_bus_to_hose(bus);
+	if (!hose)
+		return -ENODEV;
+
+	switch (which) {
+	case IOBASE_BRIDGE_NUMBER:
+		return (long)hose->first_busno;
+	case IOBASE_MEMORY:
+		return (long)hose->mem_offset[0];
+	case IOBASE_IO:
+		return (long)hose->io_base_phys;
+	case IOBASE_ISA_IO:
+		return (long)isa_io_base;
+	case IOBASE_ISA_MEM:
+		return (long)isa_mem_base;
+	}
+
+	return result;
+}
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
new file mode 100644
index 000000000..dff28f903
--- /dev/null
+++ b/arch/powerpc/kernel/pci_64.c
@@ -0,0 +1,267 @@
+/*
+ * Port for PPC64 David Engebretsen, IBM Corp.
+ * Contains common pci routines for ppc64 platform, pSeries and iSeries brands.
+ * 
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *   Rework, based on alpha PCI code.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/syscalls.h>
+#include <linux/irq.h>
+#include <linux/vmalloc.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/byteorder.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+
+/* pci_io_base -- the base address from which io bars are offsets.
+ * This is the lowest I/O base address (so bar values are always positive),
+ * and it *must* be the start of ISA space if an ISA bus exists because
+ * ISA drivers use hard coded offsets.  If no ISA bus exists nothing
+ * is mapped on the first 64K of IO space
+ */
+unsigned long pci_io_base;
+EXPORT_SYMBOL(pci_io_base);
+
+static int __init pcibios_init(void)
+{
+	struct pci_controller *hose, *tmp;
+
+	printk(KERN_INFO "PCI: Probing PCI hardware\n");
+
+	/* For now, override phys_mem_access_prot. If we need it,g
+	 * later, we may move that initialization to each ppc_md
+	 */
+	ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot;
+
+	/* On ppc64, we always enable PCI domains and we keep domain 0
+	 * backward compatible in /proc for video cards
+	 */
+	pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
+
+	/* Scan all of the recorded PCI controllers.  */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		pcibios_scan_phb(hose);
+		pci_bus_add_devices(hose->bus);
+	}
+
+	/* Call common code to handle resource allocation */
+	pcibios_resource_survey();
+
+	printk(KERN_DEBUG "PCI: Probing PCI hardware done\n");
+
+	return 0;
+}
+
+subsys_initcall(pcibios_init);
+
+int pcibios_unmap_io_space(struct pci_bus *bus)
+{
+	struct pci_controller *hose;
+
+	WARN_ON(bus == NULL);
+
+	/* If this is not a PHB, we only flush the hash table over
+	 * the area mapped by this bridge. We don't play with the PTE
+	 * mappings since we might have to deal with sub-page alignments
+	 * so flushing the hash table is the only sane way to make sure
+	 * that no hash entries are covering that removed bridge area
+	 * while still allowing other busses overlapping those pages
+	 *
+	 * Note: If we ever support P2P hotplug on Book3E, we'll have
+	 * to do an appropriate TLB flush here too
+	 */
+	if (bus->self) {
+#ifdef CONFIG_PPC_BOOK3S_64
+		struct resource *res = bus->resource[0];
+#endif
+
+		pr_debug("IO unmapping for PCI-PCI bridge %s\n",
+			 pci_name(bus->self));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+		__flush_hash_table_range(&init_mm, res->start + _IO_BASE,
+					 res->end + _IO_BASE + 1);
+#endif
+		return 0;
+	}
+
+	/* Get the host bridge */
+	hose = pci_bus_to_host(bus);
+
+	/* Check if we have IOs allocated */
+	if (hose->io_base_alloc == NULL)
+		return 0;
+
+	pr_debug("IO unmapping for PHB %pOF\n", hose->dn);
+	pr_debug("  alloc=0x%p\n", hose->io_base_alloc);
+
+	/* This is a PHB, we fully unmap the IO area */
+	vunmap(hose->io_base_alloc);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pcibios_unmap_io_space);
+
+static int pcibios_map_phb_io_space(struct pci_controller *hose)
+{
+	struct vm_struct *area;
+	unsigned long phys_page;
+	unsigned long size_page;
+	unsigned long io_virt_offset;
+
+	phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE);
+	size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE);
+
+	/* Make sure IO area address is clear */
+	hose->io_base_alloc = NULL;
+
+	/* If there's no IO to map on that bus, get away too */
+	if (hose->pci_io_size == 0 || hose->io_base_phys == 0)
+		return 0;
+
+	/* Let's allocate some IO space for that guy. We don't pass
+	 * VM_IOREMAP because we don't care about alignment tricks that
+	 * the core does in that case. Maybe we should due to stupid card
+	 * with incomplete address decoding but I'd rather not deal with
+	 * those outside of the reserved 64K legacy region.
+	 */
+	area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END);
+	if (area == NULL)
+		return -ENOMEM;
+	hose->io_base_alloc = area->addr;
+	hose->io_base_virt = (void __iomem *)(area->addr +
+					      hose->io_base_phys - phys_page);
+
+	pr_debug("IO mapping for PHB %pOF\n", hose->dn);
+	pr_debug("  phys=0x%016llx, virt=0x%p (alloc=0x%p)\n",
+		 hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc);
+	pr_debug("  size=0x%016llx (alloc=0x%016lx)\n",
+		 hose->pci_io_size, size_page);
+
+	/* Establish the mapping */
+	if (__ioremap_at(phys_page, area->addr, size_page,
+			 pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)
+		return -ENOMEM;
+
+	/* Fixup hose IO resource */
+	io_virt_offset = pcibios_io_space_offset(hose);
+	hose->io_resource.start += io_virt_offset;
+	hose->io_resource.end += io_virt_offset;
+
+	pr_debug("  hose->io_resource=%pR\n", &hose->io_resource);
+
+	return 0;
+}
+
+int pcibios_map_io_space(struct pci_bus *bus)
+{
+	WARN_ON(bus == NULL);
+
+	/* If this not a PHB, nothing to do, page tables still exist and
+	 * thus HPTEs will be faulted in when needed
+	 */
+	if (bus->self) {
+		pr_debug("IO mapping for PCI-PCI bridge %s\n",
+			 pci_name(bus->self));
+		pr_debug("  virt=0x%016llx...0x%016llx\n",
+			 bus->resource[0]->start + _IO_BASE,
+			 bus->resource[0]->end + _IO_BASE);
+		return 0;
+	}
+
+	return pcibios_map_phb_io_space(pci_bus_to_host(bus));
+}
+EXPORT_SYMBOL_GPL(pcibios_map_io_space);
+
+void pcibios_setup_phb_io_space(struct pci_controller *hose)
+{
+	pcibios_map_phb_io_space(hose);
+}
+
+#define IOBASE_BRIDGE_NUMBER	0
+#define IOBASE_MEMORY		1
+#define IOBASE_IO		2
+#define IOBASE_ISA_IO		3
+#define IOBASE_ISA_MEM		4
+
+SYSCALL_DEFINE3(pciconfig_iobase, long, which, unsigned long, in_bus,
+			  unsigned long, in_devfn)
+{
+	struct pci_controller* hose;
+	struct pci_bus *tmp_bus, *bus = NULL;
+	struct device_node *hose_node;
+
+	/* Argh ! Please forgive me for that hack, but that's the
+	 * simplest way to get existing XFree to not lockup on some
+	 * G5 machines... So when something asks for bus 0 io base
+	 * (bus 0 is HT root), we return the AGP one instead.
+	 */
+	if (in_bus == 0 && of_machine_is_compatible("MacRISC4")) {
+		struct device_node *agp;
+
+		agp = of_find_compatible_node(NULL, NULL, "u3-agp");
+		if (agp)
+			in_bus = 0xf0;
+		of_node_put(agp);
+	}
+
+	/* That syscall isn't quite compatible with PCI domains, but it's
+	 * used on pre-domains setup. We return the first match
+	 */
+
+	list_for_each_entry(tmp_bus, &pci_root_buses, node) {
+		if (in_bus >= tmp_bus->number &&
+		    in_bus <= tmp_bus->busn_res.end) {
+			bus = tmp_bus;
+			break;
+		}
+	}
+	if (bus == NULL || bus->dev.of_node == NULL)
+		return -ENODEV;
+
+	hose_node = bus->dev.of_node;
+	hose = PCI_DN(hose_node)->phb;
+
+	switch (which) {
+	case IOBASE_BRIDGE_NUMBER:
+		return (long)hose->first_busno;
+	case IOBASE_MEMORY:
+		return (long)hose->mem_offset[0];
+	case IOBASE_IO:
+		return (long)hose->io_base_phys;
+	case IOBASE_ISA_IO:
+		return (long)isa_io_base;
+	case IOBASE_ISA_MEM:
+		return -EINVAL;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_NUMA
+int pcibus_to_node(struct pci_bus *bus)
+{
+	struct pci_controller *phb = pci_bus_to_host(bus);
+	return phb->node;
+}
+EXPORT_SYMBOL(pcibus_to_node);
+#endif
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
new file mode 100644
index 000000000..7cecc3bd9
--- /dev/null
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -0,0 +1,548 @@
+/*
+ * pci_dn.c
+ *
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * PCI manipulation via device_nodes.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *    
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+/*
+ * The function is used to find the firmware data of one
+ * specific PCI device, which is attached to the indicated
+ * PCI bus. For VFs, their firmware data is linked to that
+ * one of PF's bridge. For other devices, their firmware
+ * data is linked to that of their bridge.
+ */
+static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
+{
+	struct pci_bus *pbus;
+	struct device_node *dn;
+	struct pci_dn *pdn;
+
+	/*
+	 * We probably have virtual bus which doesn't
+	 * have associated bridge.
+	 */
+	pbus = bus;
+	while (pbus) {
+		if (pci_is_root_bus(pbus) || pbus->self)
+			break;
+
+		pbus = pbus->parent;
+	}
+
+	/*
+	 * Except virtual bus, all PCI buses should
+	 * have device nodes.
+	 */
+	dn = pci_bus_to_OF_node(pbus);
+	pdn = dn ? PCI_DN(dn) : NULL;
+
+	return pdn;
+}
+
+struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
+				    int devfn)
+{
+	struct device_node *dn = NULL;
+	struct pci_dn *parent, *pdn;
+	struct pci_dev *pdev = NULL;
+
+	/* Fast path: fetch from PCI device */
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		if (pdev->devfn == devfn) {
+			if (pdev->dev.archdata.pci_data)
+				return pdev->dev.archdata.pci_data;
+
+			dn = pci_device_to_OF_node(pdev);
+			break;
+		}
+	}
+
+	/* Fast path: fetch from device node */
+	pdn = dn ? PCI_DN(dn) : NULL;
+	if (pdn)
+		return pdn;
+
+	/* Slow path: fetch from firmware data hierarchy */
+	parent = pci_bus_to_pdn(bus);
+	if (!parent)
+		return NULL;
+
+	list_for_each_entry(pdn, &parent->child_list, list) {
+		if (pdn->busno == bus->number &&
+                    pdn->devfn == devfn)
+                        return pdn;
+        }
+
+	return NULL;
+}
+
+struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
+{
+	struct device_node *dn;
+	struct pci_dn *parent, *pdn;
+
+	/* Search device directly */
+	if (pdev->dev.archdata.pci_data)
+		return pdev->dev.archdata.pci_data;
+
+	/* Check device node */
+	dn = pci_device_to_OF_node(pdev);
+	pdn = dn ? PCI_DN(dn) : NULL;
+	if (pdn)
+		return pdn;
+
+	/*
+	 * VFs don't have device nodes. We hook their
+	 * firmware data to PF's bridge.
+	 */
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return NULL;
+
+	list_for_each_entry(pdn, &parent->child_list, list) {
+		if (pdn->busno == pdev->bus->number &&
+		    pdn->devfn == pdev->devfn)
+			return pdn;
+	}
+
+	return NULL;
+}
+
+#ifdef CONFIG_PCI_IOV
+static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
+					   int vf_index,
+					   int busno, int devfn)
+{
+	struct pci_dn *pdn;
+
+	/* Except PHB, we always have the parent */
+	if (!parent)
+		return NULL;
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (!pdn)
+		return NULL;
+
+	pdn->phb = parent->phb;
+	pdn->parent = parent;
+	pdn->busno = busno;
+	pdn->devfn = devfn;
+	pdn->vf_index = vf_index;
+	pdn->pe_number = IODA_INVALID_PE;
+	INIT_LIST_HEAD(&pdn->child_list);
+	INIT_LIST_HEAD(&pdn->list);
+	list_add_tail(&pdn->list, &parent->child_list);
+
+	return pdn;
+}
+#endif
+
+struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent, *pdn;
+	int i;
+
+	/* Only support IOV for now */
+	if (!pdev->is_physfn)
+		return pci_get_pdn(pdev);
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return NULL;
+
+	pdn->flags |= PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return NULL;
+
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		struct eeh_dev *edev __maybe_unused;
+
+		pdn = add_one_dev_pci_data(parent, i,
+					   pci_iov_virtfn_bus(pdev, i),
+					   pci_iov_virtfn_devfn(pdev, i));
+		if (!pdn) {
+			dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n",
+				 __func__, i);
+			return NULL;
+		}
+
+#ifdef CONFIG_EEH
+		/* Create the EEH device for the VF */
+		edev = eeh_dev_init(pdn);
+		BUG_ON(!edev);
+		edev->physfn = pdev;
+#endif /* CONFIG_EEH */
+	}
+#endif /* CONFIG_PCI_IOV */
+
+	return pci_get_pdn(pdev);
+}
+
+void remove_dev_pci_data(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+	struct pci_dn *parent;
+	struct pci_dn *pdn, *tmp;
+	int i;
+
+	/*
+	 * VF and VF PE are created/released dynamically, so we need to
+	 * bind/unbind them.  Otherwise the VF and VF PE would be mismatched
+	 * when re-enabling SR-IOV.
+	 */
+	if (pdev->is_virtfn) {
+		pdn = pci_get_pdn(pdev);
+		pdn->pe_number = IODA_INVALID_PE;
+		return;
+	}
+
+	/* Only support IOV PF for now */
+	if (!pdev->is_physfn)
+		return;
+
+	/* Check if VFs have been populated */
+	pdn = pci_get_pdn(pdev);
+	if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF))
+		return;
+
+	pdn->flags &= ~PCI_DN_FLAG_IOV_VF;
+	parent = pci_bus_to_pdn(pdev->bus);
+	if (!parent)
+		return;
+
+	/*
+	 * We might introduce flag to pci_dn in future
+	 * so that we can release VF's firmware data in
+	 * a batch mode.
+	 */
+	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
+		struct eeh_dev *edev __maybe_unused;
+
+		list_for_each_entry_safe(pdn, tmp,
+			&parent->child_list, list) {
+			if (pdn->busno != pci_iov_virtfn_bus(pdev, i) ||
+			    pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
+				continue;
+
+#ifdef CONFIG_EEH
+			/*
+			 * Release EEH state for this VF. The PCI core
+			 * has already torn down the pci_dev for this VF, but
+			 * we're responsible to removing the eeh_dev since it
+			 * has the same lifetime as the pci_dn that spawned it.
+			 */
+			edev = pdn_to_eeh_dev(pdn);
+			if (edev) {
+				/*
+				 * We allocate pci_dn's for the totalvfs count,
+				 * but only only the vfs that were activated
+				 * have a configured PE.
+				 */
+				if (edev->pe)
+					eeh_rmv_from_parent_pe(edev);
+
+				pdn->edev = NULL;
+				kfree(edev);
+			}
+#endif /* CONFIG_EEH */
+
+			if (!list_empty(&pdn->list))
+				list_del(&pdn->list);
+
+			kfree(pdn);
+		}
+	}
+#endif /* CONFIG_PCI_IOV */
+}
+
+struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
+					struct device_node *dn)
+{
+	const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
+	const __be32 *regs;
+	struct device_node *parent;
+	struct pci_dn *pdn;
+#ifdef CONFIG_EEH
+	struct eeh_dev *edev;
+#endif
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (pdn == NULL)
+		return NULL;
+	dn->data = pdn;
+	pdn->phb = hose;
+	pdn->pe_number = IODA_INVALID_PE;
+	regs = of_get_property(dn, "reg", NULL);
+	if (regs) {
+		u32 addr = of_read_number(regs, 1);
+
+		/* First register entry is addr (00BBSS00)  */
+		pdn->busno = (addr >> 16) & 0xff;
+		pdn->devfn = (addr >> 8) & 0xff;
+	}
+
+	/* vendor/device IDs and class code */
+	regs = of_get_property(dn, "vendor-id", NULL);
+	pdn->vendor_id = regs ? of_read_number(regs, 1) : 0;
+	regs = of_get_property(dn, "device-id", NULL);
+	pdn->device_id = regs ? of_read_number(regs, 1) : 0;
+	regs = of_get_property(dn, "class-code", NULL);
+	pdn->class_code = regs ? of_read_number(regs, 1) : 0;
+
+	/* Extended config space */
+	pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1);
+
+	/* Create EEH device */
+#ifdef CONFIG_EEH
+	edev = eeh_dev_init(pdn);
+	if (!edev) {
+		kfree(pdn);
+		return NULL;
+	}
+#endif
+
+	/* Attach to parent node */
+	INIT_LIST_HEAD(&pdn->child_list);
+	INIT_LIST_HEAD(&pdn->list);
+	parent = of_get_parent(dn);
+	pdn->parent = parent ? PCI_DN(parent) : NULL;
+	if (pdn->parent)
+		list_add_tail(&pdn->list, &pdn->parent->child_list);
+
+	return pdn;
+}
+EXPORT_SYMBOL_GPL(pci_add_device_node_info);
+
+void pci_remove_device_node_info(struct device_node *dn)
+{
+	struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
+	struct device_node *parent;
+#ifdef CONFIG_EEH
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+	if (edev)
+		edev->pdn = NULL;
+#endif
+
+	if (!pdn)
+		return;
+
+	WARN_ON(!list_empty(&pdn->child_list));
+	list_del(&pdn->list);
+
+	parent = of_get_parent(dn);
+	if (parent)
+		of_node_put(parent);
+
+	dn->data = NULL;
+	kfree(pdn);
+}
+EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
+
+/*
+ * Traverse a device tree stopping each PCI device in the tree.
+ * This is done depth first.  As each node is processed, a "pre"
+ * function is called and the children are processed recursively.
+ *
+ * The "pre" func returns a value.  If non-zero is returned from
+ * the "pre" func, the traversal stops and this value is returned.
+ * This return value is useful when using traverse as a method of
+ * finding a device.
+ *
+ * NOTE: we do not run the func for devices that do not appear to
+ * be PCI except for the start node which we assume (this is good
+ * because the start node is often a phb which may be missing PCI
+ * properties).
+ * We use the class-code as an indicator. If we run into
+ * one of these nodes we also assume its siblings are non-pci for
+ * performance.
+ */
+void *pci_traverse_device_nodes(struct device_node *start,
+				void *(*fn)(struct device_node *, void *),
+				void *data)
+{
+	struct device_node *dn, *nextdn;
+	void *ret;
+
+	/* We started with a phb, iterate all childs */
+	for (dn = start->child; dn; dn = nextdn) {
+		const __be32 *classp;
+		u32 class = 0;
+
+		nextdn = NULL;
+		classp = of_get_property(dn, "class-code", NULL);
+		if (classp)
+			class = of_read_number(classp, 1);
+
+		if (fn) {
+			ret = fn(dn, data);
+			if (ret)
+				return ret;
+		}
+
+		/* If we are a PCI bridge, go down */
+		if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI ||
+				  (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS))
+			/* Depth first...do children */
+			nextdn = dn->child;
+		else if (dn->sibling)
+			/* ok, try next sibling instead. */
+			nextdn = dn->sibling;
+		if (!nextdn) {
+			/* Walk up to next valid sibling. */
+			do {
+				dn = dn->parent;
+				if (dn == start)
+					return NULL;
+			} while (dn->sibling == NULL);
+			nextdn = dn->sibling;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pci_traverse_device_nodes);
+
+static struct pci_dn *pci_dn_next_one(struct pci_dn *root,
+				      struct pci_dn *pdn)
+{
+	struct list_head *next = pdn->child_list.next;
+
+	if (next != &pdn->child_list)
+		return list_entry(next, struct pci_dn, list);
+
+	while (1) {
+		if (pdn == root)
+			return NULL;
+
+		next = pdn->list.next;
+		if (next != &pdn->parent->child_list)
+			break;
+
+		pdn = pdn->parent;
+	}
+
+	return list_entry(next, struct pci_dn, list);
+}
+
+void *traverse_pci_dn(struct pci_dn *root,
+		      void *(*fn)(struct pci_dn *, void *),
+		      void *data)
+{
+	struct pci_dn *pdn = root;
+	void *ret;
+
+	/* Only scan the child nodes */
+	for (pdn = pci_dn_next_one(root, pdn); pdn;
+	     pdn = pci_dn_next_one(root, pdn)) {
+		ret = fn(pdn, data);
+		if (ret)
+			return ret;
+	}
+
+	return NULL;
+}
+
+static void *add_pdn(struct device_node *dn, void *data)
+{
+	struct pci_controller *hose = data;
+	struct pci_dn *pdn;
+
+	pdn = pci_add_device_node_info(hose, dn);
+	if (!pdn)
+		return ERR_PTR(-ENOMEM);
+
+	return NULL;
+}
+
+/** 
+ * pci_devs_phb_init_dynamic - setup pci devices under this PHB
+ * phb: pci-to-host bridge (top-level bridge connecting to cpu)
+ *
+ * This routine is called both during boot, (before the memory
+ * subsystem is set up, before kmalloc is valid) and during the 
+ * dynamic lpar operation of adding a PHB to a running system.
+ */
+void pci_devs_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node *dn = phb->dn;
+	struct pci_dn *pdn;
+
+	/* PHB nodes themselves must not match */
+	pdn = pci_add_device_node_info(phb, dn);
+	if (pdn) {
+		pdn->devfn = pdn->busno = -1;
+		pdn->vendor_id = pdn->device_id = pdn->class_code = 0;
+		pdn->phb = phb;
+		phb->pci_data = pdn;
+	}
+
+	/* Update dn->phb ptrs for new phb and children devices */
+	pci_traverse_device_nodes(dn, add_pdn, phb);
+}
+
+/** 
+ * pci_devs_phb_init - Initialize phbs and pci devs under them.
+ * 
+ * This routine walks over all phb's (pci-host bridges) on the
+ * system, and sets up assorted pci-related structures 
+ * (including pci info in the device node structs) for each
+ * pci device found underneath.  This routine runs once,
+ * early in the boot sequence.
+ */
+static int __init pci_devs_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	/* This must be done first so the device nodes have valid pci info! */
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		pci_devs_phb_init_dynamic(phb);
+
+	return 0;
+}
+
+core_initcall(pci_devs_phb_init);
+
+static void pci_dev_pdn_setup(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn;
+
+	if (pdev->dev.archdata.pci_data)
+		return;
+
+	/* Setup the fast path */
+	pdn = pci_get_pdn(pdev);
+	pdev->dev.archdata.pci_data = pdn;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
new file mode 100644
index 000000000..7765837ba
--- /dev/null
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -0,0 +1,410 @@
+/*
+ * Helper routines to scan the device tree for PCI devices and busses
+ *
+ * Migrated out of PowerPC architecture pci_64.c file by Grant Likely
+ * <grant.likely@secretlab.ca> so that these routines are available for
+ * 32 bit also.
+ *
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *   Rework, based on alpha PCI code.
+ * Copyright (c) 2009 Secret Lab Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+
+/**
+ * get_int_prop - Decode a u32 from a device tree property
+ */
+static u32 get_int_prop(struct device_node *np, const char *name, u32 def)
+{
+	const __be32 *prop;
+	int len;
+
+	prop = of_get_property(np, name, &len);
+	if (prop && len >= 4)
+		return of_read_number(prop, 1);
+	return def;
+}
+
+/**
+ * pci_parse_of_flags - Parse the flags cell of a device tree PCI address
+ * @addr0: value of 1st cell of a device tree PCI address.
+ * @bridge: Set this flag if the address is from a bridge 'ranges' property
+ */
+unsigned int pci_parse_of_flags(u32 addr0, int bridge)
+{
+	unsigned int flags = 0;
+
+	if (addr0 & 0x02000000) {
+		flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
+		flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
+		if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64)
+			flags |= IORESOURCE_MEM_64;
+		flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
+		if (addr0 & 0x40000000)
+			flags |= IORESOURCE_PREFETCH
+				 | PCI_BASE_ADDRESS_MEM_PREFETCH;
+		/* Note: We don't know whether the ROM has been left enabled
+		 * by the firmware or not. We mark it as disabled (ie, we do
+		 * not set the IORESOURCE_ROM_ENABLE flag) for now rather than
+		 * do a config space read, it will be force-enabled if needed
+		 */
+		if (!bridge && (addr0 & 0xff) == 0x30)
+			flags |= IORESOURCE_READONLY;
+	} else if (addr0 & 0x01000000)
+		flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO;
+	if (flags)
+		flags |= IORESOURCE_SIZEALIGN;
+	return flags;
+}
+
+/**
+ * of_pci_parse_addrs - Parse PCI addresses assigned in the device tree node
+ * @node: device tree node for the PCI device
+ * @dev: pci_dev structure for the device
+ *
+ * This function parses the 'assigned-addresses' property of a PCI devices'
+ * device tree node and writes them into the associated pci_dev structure.
+ */
+static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
+{
+	u64 base, size;
+	unsigned int flags;
+	struct pci_bus_region region;
+	struct resource *res;
+	const __be32 *addrs;
+	u32 i;
+	int proplen;
+	bool mark_unset = false;
+
+	addrs = of_get_property(node, "assigned-addresses", &proplen);
+	if (!addrs || !proplen) {
+		addrs = of_get_property(node, "reg", &proplen);
+		if (!addrs || !proplen)
+			return;
+		mark_unset = true;
+	}
+
+	pr_debug("    parse addresses (%d bytes) @ %p\n", proplen, addrs);
+	for (; proplen >= 20; proplen -= 20, addrs += 5) {
+		flags = pci_parse_of_flags(of_read_number(addrs, 1), 0);
+		if (!flags)
+			continue;
+		base = of_read_number(&addrs[1], 2);
+		size = of_read_number(&addrs[3], 2);
+		if (!size)
+			continue;
+		i = of_read_number(addrs, 1) & 0xff;
+		pr_debug("  base: %llx, size: %llx, i: %x\n",
+			 (unsigned long long)base,
+			 (unsigned long long)size, i);
+
+		if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) {
+			res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2];
+		} else if (i == dev->rom_base_reg) {
+			res = &dev->resource[PCI_ROM_RESOURCE];
+			flags |= IORESOURCE_READONLY;
+		} else {
+			printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i);
+			continue;
+		}
+		res->flags = flags;
+		if (mark_unset)
+			res->flags |= IORESOURCE_UNSET;
+		res->name = pci_name(dev);
+		region.start = base;
+		region.end = base + size - 1;
+		pcibios_bus_to_resource(dev->bus, res, &region);
+	}
+}
+
+/**
+ * of_create_pci_dev - Given a device tree node on a pci bus, create a pci_dev
+ * @node: device tree node pointer
+ * @bus: bus the device is sitting on
+ * @devfn: PCI function number, extracted from device tree by caller.
+ */
+struct pci_dev *of_create_pci_dev(struct device_node *node,
+				 struct pci_bus *bus, int devfn)
+{
+	struct pci_dev *dev;
+	const char *type;
+
+	dev = pci_alloc_dev(bus);
+	if (!dev)
+		return NULL;
+	type = of_get_property(node, "device_type", NULL);
+	if (type == NULL)
+		type = "";
+
+	pr_debug("    create device, devfn: %x, type: %s\n", devfn, type);
+
+	dev->dev.of_node = of_node_get(node);
+	dev->dev.parent = bus->bridge;
+	dev->dev.bus = &pci_bus_type;
+	dev->devfn = devfn;
+	dev->multifunction = 0;		/* maybe a lie? */
+	dev->needs_freset = 0;		/* pcie fundamental reset required */
+	set_pcie_port_type(dev);
+
+	pci_dev_assign_slot(dev);
+	dev->vendor = get_int_prop(node, "vendor-id", 0xffff);
+	dev->device = get_int_prop(node, "device-id", 0xffff);
+	dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0);
+	dev->subsystem_device = get_int_prop(node, "subsystem-id", 0);
+
+	dev->cfg_size = pci_cfg_space_size(dev);
+
+	dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus),
+		dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
+	dev->class = get_int_prop(node, "class-code", 0);
+	dev->revision = get_int_prop(node, "revision-id", 0);
+
+	pr_debug("    class: 0x%x\n", dev->class);
+	pr_debug("    revision: 0x%x\n", dev->revision);
+
+	dev->current_state = PCI_UNKNOWN;	/* unknown power state */
+	dev->error_state = pci_channel_io_normal;
+	dev->dma_mask = 0xffffffff;
+
+	/* Early fixups, before probing the BARs */
+	pci_fixup_device(pci_fixup_early, dev);
+
+	if (!strcmp(type, "pci") || !strcmp(type, "pciex")) {
+		/* a PCI-PCI bridge */
+		dev->hdr_type = PCI_HEADER_TYPE_BRIDGE;
+		dev->rom_base_reg = PCI_ROM_ADDRESS1;
+		set_pcie_hotplug_bridge(dev);
+	} else if (!strcmp(type, "cardbus")) {
+		dev->hdr_type = PCI_HEADER_TYPE_CARDBUS;
+	} else {
+		dev->hdr_type = PCI_HEADER_TYPE_NORMAL;
+		dev->rom_base_reg = PCI_ROM_ADDRESS;
+		/* Maybe do a default OF mapping here */
+		dev->irq = 0;
+	}
+
+	of_pci_parse_addrs(node, dev);
+
+	pr_debug("    adding to system ...\n");
+
+	pci_device_add(dev, bus);
+
+	return dev;
+}
+EXPORT_SYMBOL(of_create_pci_dev);
+
+/**
+ * of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes
+ * @dev: pci_dev structure for the bridge
+ *
+ * of_scan_bus() calls this routine for each PCI bridge that it finds, and
+ * this routine in turn call of_scan_bus() recusively to scan for more child
+ * devices.
+ */
+void of_scan_pci_bridge(struct pci_dev *dev)
+{
+	struct device_node *node = dev->dev.of_node;
+	struct pci_bus *bus;
+	struct pci_controller *phb;
+	const __be32 *busrange, *ranges;
+	int len, i, mode;
+	struct pci_bus_region region;
+	struct resource *res;
+	unsigned int flags;
+	u64 size;
+
+	pr_debug("of_scan_pci_bridge(%pOF)\n", node);
+
+	/* parse bus-range property */
+	busrange = of_get_property(node, "bus-range", &len);
+	if (busrange == NULL || len != 8) {
+		printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %pOF\n",
+		       node);
+		return;
+	}
+	ranges = of_get_property(node, "ranges", &len);
+	if (ranges == NULL) {
+		printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %pOF\n",
+		       node);
+		return;
+	}
+
+	bus = pci_find_bus(pci_domain_nr(dev->bus),
+			   of_read_number(busrange, 1));
+	if (!bus) {
+		bus = pci_add_new_bus(dev->bus, dev,
+				      of_read_number(busrange, 1));
+		if (!bus) {
+			printk(KERN_ERR "Failed to create pci bus for %pOF\n",
+			       node);
+			return;
+		}
+	}
+
+	bus->primary = dev->bus->number;
+	pci_bus_insert_busn_res(bus, of_read_number(busrange, 1),
+				of_read_number(busrange+1, 1));
+	bus->bridge_ctl = 0;
+
+	/* parse ranges property */
+	/* PCI #address-cells == 3 and #size-cells == 2 always */
+	res = &dev->resource[PCI_BRIDGE_RESOURCES];
+	for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) {
+		res->flags = 0;
+		bus->resource[i] = res;
+		++res;
+	}
+	i = 1;
+	for (; len >= 32; len -= 32, ranges += 8) {
+		flags = pci_parse_of_flags(of_read_number(ranges, 1), 1);
+		size = of_read_number(&ranges[6], 2);
+		if (flags == 0 || size == 0)
+			continue;
+		if (flags & IORESOURCE_IO) {
+			res = bus->resource[0];
+			if (res->flags) {
+				printk(KERN_ERR "PCI: ignoring extra I/O range"
+				       " for bridge %pOF\n", node);
+				continue;
+			}
+		} else {
+			if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) {
+				printk(KERN_ERR "PCI: too many memory ranges"
+				       " for bridge %pOF\n", node);
+				continue;
+			}
+			res = bus->resource[i];
+			++i;
+		}
+		res->flags = flags;
+		region.start = of_read_number(&ranges[1], 2);
+		region.end = region.start + size - 1;
+		pcibios_bus_to_resource(dev->bus, res, &region);
+	}
+	sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus),
+		bus->number);
+	pr_debug("    bus name: %s\n", bus->name);
+
+	phb = pci_bus_to_host(bus);
+
+	mode = PCI_PROBE_NORMAL;
+	if (phb->controller_ops.probe_mode)
+		mode = phb->controller_ops.probe_mode(bus);
+	pr_debug("    probe mode: %d\n", mode);
+
+	if (mode == PCI_PROBE_DEVTREE)
+		of_scan_bus(node, bus);
+	else if (mode == PCI_PROBE_NORMAL)
+		pci_scan_child_bus(bus);
+}
+EXPORT_SYMBOL(of_scan_pci_bridge);
+
+static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus,
+			    struct device_node *dn)
+{
+	struct pci_dev *dev = NULL;
+	const __be32 *reg;
+	int reglen, devfn;
+#ifdef CONFIG_EEH
+	struct eeh_dev *edev = pdn_to_eeh_dev(PCI_DN(dn));
+#endif
+
+	pr_debug("  * %pOF\n", dn);
+	if (!of_device_is_available(dn))
+		return NULL;
+
+	reg = of_get_property(dn, "reg", &reglen);
+	if (reg == NULL || reglen < 20)
+		return NULL;
+	devfn = (of_read_number(reg, 1) >> 8) & 0xff;
+
+	/* Check if the PCI device is already there */
+	dev = pci_get_slot(bus, devfn);
+	if (dev) {
+		pci_dev_put(dev);
+		return dev;
+	}
+
+	/* Device removed permanently ? */
+#ifdef CONFIG_EEH
+	if (edev && (edev->mode & EEH_DEV_REMOVED))
+		return NULL;
+#endif
+
+	/* create a new pci_dev for this device */
+	dev = of_create_pci_dev(dn, bus, devfn);
+	if (!dev)
+		return NULL;
+
+	pr_debug("  dev header type: %x\n", dev->hdr_type);
+	return dev;
+}
+
+/**
+ * __of_scan_bus - given a PCI bus node, setup bus and scan for child devices
+ * @node: device tree node for the PCI bus
+ * @bus: pci_bus structure for the PCI bus
+ * @rescan_existing: Flag indicating bus has already been set up
+ */
+static void __of_scan_bus(struct device_node *node, struct pci_bus *bus,
+			  int rescan_existing)
+{
+	struct device_node *child;
+	struct pci_dev *dev;
+
+	pr_debug("of_scan_bus(%pOF) bus no %d...\n",
+		 node, bus->number);
+
+	/* Scan direct children */
+	for_each_child_of_node(node, child) {
+		dev = of_scan_pci_dev(bus, child);
+		if (!dev)
+			continue;
+		pr_debug("    dev header type: %x\n", dev->hdr_type);
+	}
+
+	/* Apply all fixups necessary. We don't fixup the bus "self"
+	 * for an existing bridge that is being rescanned
+	 */
+	if (!rescan_existing)
+		pcibios_setup_bus_self(bus);
+	pcibios_setup_bus_devices(bus);
+
+	/* Now scan child busses */
+	for_each_pci_bridge(dev, bus)
+		of_scan_pci_bridge(dev);
+}
+
+/**
+ * of_scan_bus - given a PCI bus node, setup bus and scan for child devices
+ * @node: device tree node for the PCI bus
+ * @bus: pci_bus structure for the PCI bus
+ */
+void of_scan_bus(struct device_node *node, struct pci_bus *bus)
+{
+	__of_scan_bus(node, bus, 0);
+}
+EXPORT_SYMBOL_GPL(of_scan_bus);
+
+/**
+ * of_rescan_bus - given a PCI bus node, scan for child devices
+ * @node: device tree node for the PCI bus
+ * @bus: pci_bus structure for the PCI bus
+ *
+ * Same as of_scan_bus, but for a pci_bus structure that has already been
+ * setup.
+ */
+void of_rescan_bus(struct device_node *node, struct pci_bus *bus)
+{
+	__of_scan_bus(node, bus, 1);
+}
+EXPORT_SYMBOL_GPL(of_rescan_bus);
+
diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c
new file mode 100644
index 000000000..58eaa3ddf
--- /dev/null
+++ b/arch/powerpc/kernel/pmc.c
@@ -0,0 +1,102 @@
+/*
+ *  arch/powerpc/kernel/pmc.c
+ *
+ *  Copyright (C) 2004 David Gibson, IBM Corporation.
+ *  Includes code formerly from arch/ppc/kernel/perfmon.c:
+ *    Author: Andy Fleming
+ *    Copyright (c) 2004 Freescale Semiconductor, Inc
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/pmc.h>
+
+#ifndef MMCR0_PMAO
+#define MMCR0_PMAO	0
+#endif
+
+static void dummy_perf(struct pt_regs *regs)
+{
+#if defined(CONFIG_FSL_EMB_PERFMON)
+	mtpmr(PMRN_PMGC0, mfpmr(PMRN_PMGC0) & ~PMGC0_PMIE);
+#elif defined(CONFIG_PPC64) || defined(CONFIG_6xx)
+	if (cur_cpu_spec->pmc_type == PPC_PMC_IBM)
+		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~(MMCR0_PMXE|MMCR0_PMAO));
+#else
+	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMXE);
+#endif
+}
+
+
+static DEFINE_RAW_SPINLOCK(pmc_owner_lock);
+static void *pmc_owner_caller; /* mostly for debugging */
+perf_irq_t perf_irq = dummy_perf;
+
+int reserve_pmc_hardware(perf_irq_t new_perf_irq)
+{
+	int err = 0;
+
+	raw_spin_lock(&pmc_owner_lock);
+
+	if (pmc_owner_caller) {
+		printk(KERN_WARNING "reserve_pmc_hardware: "
+		       "PMC hardware busy (reserved by caller %p)\n",
+		       pmc_owner_caller);
+		err = -EBUSY;
+		goto out;
+	}
+
+	pmc_owner_caller = __builtin_return_address(0);
+	perf_irq = new_perf_irq ? new_perf_irq : dummy_perf;
+
+ out:
+	raw_spin_unlock(&pmc_owner_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(reserve_pmc_hardware);
+
+void release_pmc_hardware(void)
+{
+	raw_spin_lock(&pmc_owner_lock);
+
+	WARN_ON(! pmc_owner_caller);
+
+	pmc_owner_caller = NULL;
+	perf_irq = dummy_perf;
+
+	raw_spin_unlock(&pmc_owner_lock);
+}
+EXPORT_SYMBOL_GPL(release_pmc_hardware);
+
+#ifdef CONFIG_PPC64
+void power4_enable_pmcs(void)
+{
+	unsigned long hid0;
+
+	hid0 = mfspr(SPRN_HID0);
+	hid0 |= 1UL << (63 - 20);
+
+	/* POWER4 requires the following sequence */
+	asm volatile(
+		"sync\n"
+		"mtspr     %1, %0\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"isync" : "=&r" (hid0) : "i" (SPRN_HID0), "0" (hid0):
+		"memory");
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h
new file mode 100644
index 000000000..a27c914d5
--- /dev/null
+++ b/arch/powerpc/kernel/ppc32.h
@@ -0,0 +1,64 @@
+#ifndef _PPC64_PPC32_H
+#define _PPC64_PPC32_H
+
+#include <linux/compat.h>
+#include <asm/siginfo.h>
+#include <asm/signal.h>
+
+/*
+ * Data types and macros for providing 32b PowerPC support.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* These are here to support 32-bit syscalls on a 64-bit kernel. */
+
+struct pt_regs32 {
+	unsigned int gpr[32];
+	unsigned int nip;
+	unsigned int msr;
+	unsigned int orig_gpr3;		/* Used for restarting system calls */
+	unsigned int ctr;
+	unsigned int link;
+	unsigned int xer;
+	unsigned int ccr;
+	unsigned int mq;		/* 601 only (not used at present) */
+	unsigned int trap;		/* Reason for being here */
+	unsigned int dar;		/* Fault registers */
+	unsigned int dsisr;
+	unsigned int result;		/* Result of a system call */
+};
+
+struct sigcontext32 {
+	unsigned int	_unused[4];
+	int		signal;
+	compat_uptr_t	handler;
+	unsigned int	oldmask;
+	compat_uptr_t	regs;  /* 4 byte pointer to the pt_regs32 structure. */
+};
+
+struct mcontext32 {
+	elf_gregset_t32		mc_gregs;
+	elf_fpregset_t		mc_fregs;
+	unsigned int		mc_pad[2];
+	elf_vrregset_t32	mc_vregs __attribute__((__aligned__(16)));
+	elf_vsrreghalf_t32      mc_vsregs __attribute__((__aligned__(16)));
+};
+
+struct ucontext32 { 
+	unsigned int	  	uc_flags;
+	unsigned int 	  	uc_link;
+	compat_stack_t	 	uc_stack;
+	int		 	uc_pad[7];
+	compat_uptr_t		uc_regs;	/* points to uc_mcontext field */
+	compat_sigset_t	 	uc_sigmask;	/* mask last for extensibility */
+	/* glibc has 1024-bit signal masks, ours are 64-bit */
+	int		 	uc_maskext[30];
+	int		 	uc_pad2[3];
+	struct mcontext32	uc_mcontext;
+};
+
+#endif  /* _PPC64_PPC32_H */
diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S
new file mode 100644
index 000000000..6d1b42ee7
--- /dev/null
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * NOTE: assert(sizeof(buf) > 23 * sizeof(long))
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/asm-compat.h>
+
+/*
+ * Grab the register values as they are now.
+ * This won't do a particularly good job because we really
+ * want our caller's caller's registers, and our caller has
+ * already executed its prologue.
+ * ToDo: We could reach back into the caller's save area to do
+ * a better job of representing the caller's state (note that
+ * that will be different for 32-bit and 64-bit, because of the
+ * different ABIs, though).
+ */
+_GLOBAL(ppc_save_regs)
+	PPC_STL	r0,0*SZL(r3)
+#ifdef CONFIG_PPC32
+	stmw	r2, 2*SZL(r3)
+#else
+	PPC_STL	r2,2*SZL(r3)
+	PPC_STL	r3,3*SZL(r3)
+	PPC_STL	r4,4*SZL(r3)
+	PPC_STL	r5,5*SZL(r3)
+	PPC_STL	r6,6*SZL(r3)
+	PPC_STL	r7,7*SZL(r3)
+	PPC_STL	r8,8*SZL(r3)
+	PPC_STL	r9,9*SZL(r3)
+	PPC_STL	r10,10*SZL(r3)
+	PPC_STL	r11,11*SZL(r3)
+	PPC_STL	r12,12*SZL(r3)
+	PPC_STL	r13,13*SZL(r3)
+	PPC_STL	r14,14*SZL(r3)
+	PPC_STL	r15,15*SZL(r3)
+	PPC_STL	r16,16*SZL(r3)
+	PPC_STL	r17,17*SZL(r3)
+	PPC_STL	r18,18*SZL(r3)
+	PPC_STL	r19,19*SZL(r3)
+	PPC_STL	r20,20*SZL(r3)
+	PPC_STL	r21,21*SZL(r3)
+	PPC_STL	r22,22*SZL(r3)
+	PPC_STL	r23,23*SZL(r3)
+	PPC_STL	r24,24*SZL(r3)
+	PPC_STL	r25,25*SZL(r3)
+	PPC_STL	r26,26*SZL(r3)
+	PPC_STL	r27,27*SZL(r3)
+	PPC_STL	r28,28*SZL(r3)
+	PPC_STL	r29,29*SZL(r3)
+	PPC_STL	r30,30*SZL(r3)
+	PPC_STL	r31,31*SZL(r3)
+#endif
+	/* go up one stack frame for SP */
+	PPC_LL	r4,0(r1)
+	PPC_STL	r4,1*SZL(r3)
+	/* get caller's LR */
+	PPC_LL	r0,LRSAVE(r4)
+	PPC_STL	r0,_NIP-STACK_FRAME_OVERHEAD(r3)
+	PPC_STL	r0,_LINK-STACK_FRAME_OVERHEAD(r3)
+	mfmsr	r0
+	PPC_STL	r0,_MSR-STACK_FRAME_OVERHEAD(r3)
+	mfctr	r0
+	PPC_STL	r0,_CTR-STACK_FRAME_OVERHEAD(r3)
+	mfxer	r0
+	PPC_STL	r0,_XER-STACK_FRAME_OVERHEAD(r3)
+	mfcr	r0
+	PPC_STL	r0,_CCR-STACK_FRAME_OVERHEAD(r3)
+	li	r0,0
+	PPC_STL	r0,_TRAP-STACK_FRAME_OVERHEAD(r3)
+	blr
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
new file mode 100644
index 000000000..9bfbd800d
--- /dev/null
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/rtas.h>
+#include <linux/uaccess.h>
+#include <asm/prom.h>
+
+#ifdef CONFIG_PPC64
+
+static loff_t page_map_seek(struct file *file, loff_t off, int whence)
+{
+	return fixed_size_llseek(file, off, whence, PAGE_SIZE);
+}
+
+static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
+			      loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, nbytes, ppos,
+			PDE_DATA(file_inode(file)), PAGE_SIZE);
+}
+
+static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
+{
+	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE)
+		return -EINVAL;
+
+	remap_pfn_range(vma, vma->vm_start,
+			__pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT,
+			PAGE_SIZE, vma->vm_page_prot);
+	return 0;
+}
+
+static const struct file_operations page_map_fops = {
+	.llseek	= page_map_seek,
+	.read	= page_map_read,
+	.mmap	= page_map_mmap
+};
+
+
+static int __init proc_ppc64_init(void)
+{
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_data("powerpc/systemcfg", S_IFREG | 0444, NULL,
+			       &page_map_fops, vdso_data);
+	if (!pde)
+		return 1;
+	proc_set_size(pde, PAGE_SIZE);
+
+	return 0;
+}
+__initcall(proc_ppc64_init);
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Create the ppc64 and ppc64/rtas directories early. This allows us to
+ * assume that they have been previously created in drivers.
+ */
+static int __init proc_ppc64_create(void)
+{
+	struct proc_dir_entry *root;
+
+	root = proc_mkdir("powerpc", NULL);
+	if (!root)
+		return 1;
+
+#ifdef CONFIG_PPC64
+	if (!proc_symlink("ppc64", NULL, "powerpc"))
+		pr_err("Failed to create link /proc/ppc64 -> /proc/powerpc\n");
+#endif
+
+	if (!of_find_node_by_path("/rtas"))
+		return 0;
+
+	if (!proc_mkdir("rtas", root))
+		return 1;
+
+	if (!proc_symlink("rtas", NULL, "powerpc/rtas"))
+		return 1;
+
+	return 0;
+}
+core_initcall(proc_ppc64_create);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
new file mode 100644
index 000000000..56c33285b
--- /dev/null
+++ b/arch/powerpc/kernel/process.c
@@ -0,0 +1,2190 @@
+/*
+ *  Derived from "arch/i386/kernel/process.c"
+ *    Copyright (C) 1995  Linus Torvalds
+ *
+ *  Updated and modified by Cort Dougan (cort@cs.nmt.edu) and
+ *  Paul Mackerras (paulus@cs.anu.edu.au)
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/prctl.h>
+#include <linux/init_task.h>
+#include <linux/export.h>
+#include <linux/kallsyms.h>
+#include <linux/mqueue.h>
+#include <linux/hardirq.h>
+#include <linux/utsname.h>
+#include <linux/ftrace.h>
+#include <linux/kernel_stat.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/elf-randomize.h>
+#include <linux/pkeys.h>
+
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/time.h>
+#include <asm/runlatch.h>
+#include <asm/syscalls.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/debug.h>
+#ifdef CONFIG_PPC64
+#include <asm/firmware.h>
+#include <asm/hw_irq.h>
+#endif
+#include <asm/code-patching.h>
+#include <asm/exec.h>
+#include <asm/livepatch.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/asm-prototypes.h>
+
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+
+/* Transactional Memory debug */
+#ifdef TM_DEBUG_SW
+#define TM_DEBUG(x...) printk(KERN_INFO x)
+#else
+#define TM_DEBUG(x...) do { } while(0)
+#endif
+
+extern unsigned long _get_SP(void);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Are we running in "Suspend disabled" mode? If so we have to block any
+ * sigreturn that would get us into suspended state, and we also warn in some
+ * other paths that we should never reach with suspend disabled.
+ */
+bool tm_suspend_disabled __ro_after_init = false;
+
+static void check_if_tm_restore_required(struct task_struct *tsk)
+{
+	/*
+	 * If we are saving the current thread's registers, and the
+	 * thread is in a transactional state, set the TIF_RESTORE_TM
+	 * bit so that we know to restore the registers before
+	 * returning to userspace.
+	 */
+	if (tsk == current && tsk->thread.regs &&
+	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+	    !test_thread_flag(TIF_RESTORE_TM)) {
+		tsk->thread.ckpt_regs.msr = tsk->thread.regs->msr;
+		set_thread_flag(TIF_RESTORE_TM);
+	}
+}
+
+#else
+static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+bool strict_msr_control;
+EXPORT_SYMBOL(strict_msr_control);
+
+static int __init enable_strict_msr_control(char *str)
+{
+	strict_msr_control = true;
+	pr_info("Enabling strict facility control\n");
+
+	return 0;
+}
+early_param("ppc_strict_facility_enable", enable_strict_msr_control);
+
+unsigned long msr_check_and_set(unsigned long bits)
+{
+	unsigned long oldmsr = mfmsr();
+	unsigned long newmsr;
+
+	newmsr = oldmsr | bits;
+
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP))
+		newmsr |= MSR_VSX;
+#endif
+
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+
+	return newmsr;
+}
+EXPORT_SYMBOL_GPL(msr_check_and_set);
+
+void __msr_check_and_clear(unsigned long bits)
+{
+	unsigned long oldmsr = mfmsr();
+	unsigned long newmsr;
+
+	newmsr = oldmsr & ~bits;
+
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX) && (bits & MSR_FP))
+		newmsr &= ~MSR_VSX;
+#endif
+
+	if (oldmsr != newmsr)
+		mtmsr_isync(newmsr);
+}
+EXPORT_SYMBOL(__msr_check_and_clear);
+
+#ifdef CONFIG_PPC_FPU
+static void __giveup_fpu(struct task_struct *tsk)
+{
+	unsigned long msr;
+
+	save_fpu(tsk);
+	msr = tsk->thread.regs->msr;
+	msr &= ~(MSR_FP|MSR_FE0|MSR_FE1);
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr &= ~MSR_VSX;
+#endif
+	tsk->thread.regs->msr = msr;
+}
+
+void giveup_fpu(struct task_struct *tsk)
+{
+	check_if_tm_restore_required(tsk);
+
+	msr_check_and_set(MSR_FP);
+	__giveup_fpu(tsk);
+	msr_check_and_clear(MSR_FP);
+}
+EXPORT_SYMBOL(giveup_fpu);
+
+/*
+ * Make sure the floating-point register state in the
+ * the thread_struct is up to date for task tsk.
+ */
+void flush_fp_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		/*
+		 * We need to disable preemption here because if we didn't,
+		 * another process could get scheduled after the regs->msr
+		 * test but before we have finished saving the FP registers
+		 * to the thread_struct.  That process could take over the
+		 * FPU, and then when we get scheduled again we would store
+		 * bogus values for the remaining FP registers.
+		 */
+		preempt_disable();
+		if (tsk->thread.regs->msr & MSR_FP) {
+			/*
+			 * This should only ever be called for current or
+			 * for a stopped child process.  Since we save away
+			 * the FP register state on context switch,
+			 * there is something wrong if a stopped child appears
+			 * to still have its FP state in the CPU registers.
+			 */
+			BUG_ON(tsk != current);
+			giveup_fpu(tsk);
+		}
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
+
+void enable_kernel_fp(void)
+{
+	unsigned long cpumsr;
+
+	WARN_ON(preemptible());
+
+	cpumsr = msr_check_and_set(MSR_FP);
+
+	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
+		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save  to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if (!MSR_TM_ACTIVE(cpumsr) &&
+		     MSR_TM_ACTIVE(current->thread.regs->msr))
+			return;
+		__giveup_fpu(current);
+	}
+}
+EXPORT_SYMBOL(enable_kernel_fp);
+
+static int restore_fp(struct task_struct *tsk)
+{
+	if (tsk->thread.load_fp) {
+		load_fp_state(&current->thread.fp_state);
+		current->thread.load_fp++;
+		return 1;
+	}
+	return 0;
+}
+#else
+static int restore_fp(struct task_struct *tsk) { return 0; }
+#endif /* CONFIG_PPC_FPU */
+
+#ifdef CONFIG_ALTIVEC
+#define loadvec(thr) ((thr).load_vec)
+
+static void __giveup_altivec(struct task_struct *tsk)
+{
+	unsigned long msr;
+
+	save_altivec(tsk);
+	msr = tsk->thread.regs->msr;
+	msr &= ~MSR_VEC;
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr &= ~MSR_VSX;
+#endif
+	tsk->thread.regs->msr = msr;
+}
+
+void giveup_altivec(struct task_struct *tsk)
+{
+	check_if_tm_restore_required(tsk);
+
+	msr_check_and_set(MSR_VEC);
+	__giveup_altivec(tsk);
+	msr_check_and_clear(MSR_VEC);
+}
+EXPORT_SYMBOL(giveup_altivec);
+
+void enable_kernel_altivec(void)
+{
+	unsigned long cpumsr;
+
+	WARN_ON(preemptible());
+
+	cpumsr = msr_check_and_set(MSR_VEC);
+
+	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
+		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save  to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if (!MSR_TM_ACTIVE(cpumsr) &&
+		     MSR_TM_ACTIVE(current->thread.regs->msr))
+			return;
+		__giveup_altivec(current);
+	}
+}
+EXPORT_SYMBOL(enable_kernel_altivec);
+
+/*
+ * Make sure the VMX/Altivec register state in the
+ * the thread_struct is up to date for task tsk.
+ */
+void flush_altivec_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		if (tsk->thread.regs->msr & MSR_VEC) {
+			BUG_ON(tsk != current);
+			giveup_altivec(tsk);
+		}
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
+
+static int restore_altivec(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) {
+		load_vr_state(&tsk->thread.vr_state);
+		tsk->thread.used_vr = 1;
+		tsk->thread.load_vec++;
+
+		return 1;
+	}
+	return 0;
+}
+#else
+#define loadvec(thr) 0
+static inline int restore_altivec(struct task_struct *tsk) { return 0; }
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_VSX
+static void __giveup_vsx(struct task_struct *tsk)
+{
+	unsigned long msr = tsk->thread.regs->msr;
+
+	/*
+	 * We should never be ssetting MSR_VSX without also setting
+	 * MSR_FP and MSR_VEC
+	 */
+	WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC)));
+
+	/* __giveup_fpu will clear MSR_VSX */
+	if (msr & MSR_FP)
+		__giveup_fpu(tsk);
+	if (msr & MSR_VEC)
+		__giveup_altivec(tsk);
+}
+
+static void giveup_vsx(struct task_struct *tsk)
+{
+	check_if_tm_restore_required(tsk);
+
+	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
+	__giveup_vsx(tsk);
+	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
+}
+
+void enable_kernel_vsx(void)
+{
+	unsigned long cpumsr;
+
+	WARN_ON(preemptible());
+
+	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
+
+	if (current->thread.regs &&
+	    (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) {
+		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save  to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if (!MSR_TM_ACTIVE(cpumsr) &&
+		     MSR_TM_ACTIVE(current->thread.regs->msr))
+			return;
+		__giveup_vsx(current);
+	}
+}
+EXPORT_SYMBOL(enable_kernel_vsx);
+
+void flush_vsx_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) {
+			BUG_ON(tsk != current);
+			giveup_vsx(tsk);
+		}
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
+
+static int restore_vsx(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_VSX)) {
+		tsk->thread.used_vsr = 1;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int restore_vsx(struct task_struct *tsk) { return 0; }
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_SPE
+void giveup_spe(struct task_struct *tsk)
+{
+	check_if_tm_restore_required(tsk);
+
+	msr_check_and_set(MSR_SPE);
+	__giveup_spe(tsk);
+	msr_check_and_clear(MSR_SPE);
+}
+EXPORT_SYMBOL(giveup_spe);
+
+void enable_kernel_spe(void)
+{
+	WARN_ON(preemptible());
+
+	msr_check_and_set(MSR_SPE);
+
+	if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) {
+		check_if_tm_restore_required(current);
+		__giveup_spe(current);
+	}
+}
+EXPORT_SYMBOL(enable_kernel_spe);
+
+void flush_spe_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		if (tsk->thread.regs->msr & MSR_SPE) {
+			BUG_ON(tsk != current);
+			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
+			giveup_spe(tsk);
+		}
+		preempt_enable();
+	}
+}
+#endif /* CONFIG_SPE */
+
+static unsigned long msr_all_available;
+
+static int __init init_msr_all_available(void)
+{
+#ifdef CONFIG_PPC_FPU
+	msr_all_available |= MSR_FP;
+#endif
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		msr_all_available |= MSR_VEC;
+#endif
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr_all_available |= MSR_VSX;
+#endif
+#ifdef CONFIG_SPE
+	if (cpu_has_feature(CPU_FTR_SPE))
+		msr_all_available |= MSR_SPE;
+#endif
+
+	return 0;
+}
+early_initcall(init_msr_all_available);
+
+void giveup_all(struct task_struct *tsk)
+{
+	unsigned long usermsr;
+
+	if (!tsk->thread.regs)
+		return;
+
+	check_if_tm_restore_required(tsk);
+
+	usermsr = tsk->thread.regs->msr;
+
+	if ((usermsr & msr_all_available) == 0)
+		return;
+
+	msr_check_and_set(msr_all_available);
+
+	WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC)));
+
+#ifdef CONFIG_PPC_FPU
+	if (usermsr & MSR_FP)
+		__giveup_fpu(tsk);
+#endif
+#ifdef CONFIG_ALTIVEC
+	if (usermsr & MSR_VEC)
+		__giveup_altivec(tsk);
+#endif
+#ifdef CONFIG_SPE
+	if (usermsr & MSR_SPE)
+		__giveup_spe(tsk);
+#endif
+
+	msr_check_and_clear(msr_all_available);
+}
+EXPORT_SYMBOL(giveup_all);
+
+void restore_math(struct pt_regs *regs)
+{
+	unsigned long msr;
+
+	if (!MSR_TM_ACTIVE(regs->msr) &&
+		!current->thread.load_fp && !loadvec(current->thread))
+		return;
+
+	msr = regs->msr;
+	msr_check_and_set(msr_all_available);
+
+	/*
+	 * Only reload if the bit is not set in the user MSR, the bit BEING set
+	 * indicates that the registers are hot
+	 */
+	if ((!(msr & MSR_FP)) && restore_fp(current))
+		msr |= MSR_FP | current->thread.fpexc_mode;
+
+	if ((!(msr & MSR_VEC)) && restore_altivec(current))
+		msr |= MSR_VEC;
+
+	if ((msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC) &&
+			restore_vsx(current)) {
+		msr |= MSR_VSX;
+	}
+
+	msr_check_and_clear(msr_all_available);
+
+	regs->msr = msr;
+}
+
+static void save_all(struct task_struct *tsk)
+{
+	unsigned long usermsr;
+
+	if (!tsk->thread.regs)
+		return;
+
+	usermsr = tsk->thread.regs->msr;
+
+	if ((usermsr & msr_all_available) == 0)
+		return;
+
+	msr_check_and_set(msr_all_available);
+
+	WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC)));
+
+	if (usermsr & MSR_FP)
+		save_fpu(tsk);
+
+	if (usermsr & MSR_VEC)
+		save_altivec(tsk);
+
+	if (usermsr & MSR_SPE)
+		__giveup_spe(tsk);
+
+	msr_check_and_clear(msr_all_available);
+	thread_pkey_regs_save(&tsk->thread);
+}
+
+void flush_all_to_thread(struct task_struct *tsk)
+{
+	if (tsk->thread.regs) {
+		preempt_disable();
+		BUG_ON(tsk != current);
+#ifdef CONFIG_SPE
+		if (tsk->thread.regs->msr & MSR_SPE)
+			tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
+#endif
+		save_all(tsk);
+
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL(flush_all_to_thread);
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+void do_send_trap(struct pt_regs *regs, unsigned long address,
+		  unsigned long error_code, int breakpt)
+{
+	current->thread.trap_nr = TRAP_HWBKPT;
+	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+			11, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	/* Deliver the signal to userspace */
+	force_sig_ptrace_errno_trap(breakpt, /* breakpoint or watchpoint id */
+				    (void __user *)address);
+}
+#else	/* !CONFIG_PPC_ADV_DEBUG_REGS */
+void do_break (struct pt_regs *regs, unsigned long address,
+		    unsigned long error_code)
+{
+	siginfo_t info;
+
+	current->thread.trap_nr = TRAP_HWBKPT;
+	if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+			11, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (debugger_break_match(regs))
+		return;
+
+	/* Clear the breakpoint */
+	hw_breakpoint_disable();
+
+	/* Deliver the signal to userspace */
+	clear_siginfo(&info);
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code = TRAP_HWBKPT;
+	info.si_addr = (void __user *)address;
+	force_sig_info(SIGTRAP, &info, current);
+}
+#endif	/* CONFIG_PPC_ADV_DEBUG_REGS */
+
+static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk);
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+/*
+ * Set the debug registers back to their default "safe" values.
+ */
+static void set_debug_reg_defaults(struct thread_struct *thread)
+{
+	thread->debug.iac1 = thread->debug.iac2 = 0;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	thread->debug.iac3 = thread->debug.iac4 = 0;
+#endif
+	thread->debug.dac1 = thread->debug.dac2 = 0;
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+	thread->debug.dvc1 = thread->debug.dvc2 = 0;
+#endif
+	thread->debug.dbcr0 = 0;
+#ifdef CONFIG_BOOKE
+	/*
+	 * Force User/Supervisor bits to b11 (user-only MSR[PR]=1)
+	 */
+	thread->debug.dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US |
+			DBCR1_IAC3US | DBCR1_IAC4US;
+	/*
+	 * Force Data Address Compare User/Supervisor bits to be User-only
+	 * (0b11 MSR[PR]=1) and set all other bits in DBCR2 register to be 0.
+	 */
+	thread->debug.dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
+#else
+	thread->debug.dbcr1 = 0;
+#endif
+}
+
+static void prime_debug_regs(struct debug_reg *debug)
+{
+	/*
+	 * We could have inherited MSR_DE from userspace, since
+	 * it doesn't get cleared on exception entry.  Make sure
+	 * MSR_DE is clear before we enable any debug events.
+	 */
+	mtmsr(mfmsr() & ~MSR_DE);
+
+	mtspr(SPRN_IAC1, debug->iac1);
+	mtspr(SPRN_IAC2, debug->iac2);
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	mtspr(SPRN_IAC3, debug->iac3);
+	mtspr(SPRN_IAC4, debug->iac4);
+#endif
+	mtspr(SPRN_DAC1, debug->dac1);
+	mtspr(SPRN_DAC2, debug->dac2);
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+	mtspr(SPRN_DVC1, debug->dvc1);
+	mtspr(SPRN_DVC2, debug->dvc2);
+#endif
+	mtspr(SPRN_DBCR0, debug->dbcr0);
+	mtspr(SPRN_DBCR1, debug->dbcr1);
+#ifdef CONFIG_BOOKE
+	mtspr(SPRN_DBCR2, debug->dbcr2);
+#endif
+}
+/*
+ * Unless neither the old or new thread are making use of the
+ * debug registers, set the debug registers from the values
+ * stored in the new thread.
+ */
+void switch_booke_debug_regs(struct debug_reg *new_debug)
+{
+	if ((current->thread.debug.dbcr0 & DBCR0_IDM)
+		|| (new_debug->dbcr0 & DBCR0_IDM))
+			prime_debug_regs(new_debug);
+}
+EXPORT_SYMBOL_GPL(switch_booke_debug_regs);
+#else	/* !CONFIG_PPC_ADV_DEBUG_REGS */
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
+static void set_breakpoint(struct arch_hw_breakpoint *brk)
+{
+	preempt_disable();
+	__set_breakpoint(brk);
+	preempt_enable();
+}
+
+static void set_debug_reg_defaults(struct thread_struct *thread)
+{
+	thread->hw_brk.address = 0;
+	thread->hw_brk.type = 0;
+	if (ppc_breakpoint_available())
+		set_breakpoint(&thread->hw_brk);
+}
+#endif /* !CONFIG_HAVE_HW_BREAKPOINT */
+#endif	/* CONFIG_PPC_ADV_DEBUG_REGS */
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+	mtspr(SPRN_DAC1, dabr);
+#ifdef CONFIG_PPC_47x
+	isync();
+#endif
+	return 0;
+}
+#elif defined(CONFIG_PPC_BOOK3S)
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+	mtspr(SPRN_DABR, dabr);
+	if (cpu_has_feature(CPU_FTR_DABRX))
+		mtspr(SPRN_DABRX, dabrx);
+	return 0;
+}
+#elif defined(CONFIG_PPC_8xx)
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+	unsigned long addr = dabr & ~HW_BRK_TYPE_DABR;
+	unsigned long lctrl1 = 0x90000000; /* compare type: equal on E & F */
+	unsigned long lctrl2 = 0x8e000002; /* watchpoint 1 on cmp E | F */
+
+	if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ)
+		lctrl1 |= 0xa0000;
+	else if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE)
+		lctrl1 |= 0xf0000;
+	else if ((dabr & HW_BRK_TYPE_RDWR) == 0)
+		lctrl2 = 0;
+
+	mtspr(SPRN_LCTRL2, 0);
+	mtspr(SPRN_CMPE, addr);
+	mtspr(SPRN_CMPF, addr + 4);
+	mtspr(SPRN_LCTRL1, lctrl1);
+	mtspr(SPRN_LCTRL2, lctrl2);
+
+	return 0;
+}
+#else
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+	return -EINVAL;
+}
+#endif
+
+static inline int set_dabr(struct arch_hw_breakpoint *brk)
+{
+	unsigned long dabr, dabrx;
+
+	dabr = brk->address | (brk->type & HW_BRK_TYPE_DABR);
+	dabrx = ((brk->type >> 3) & 0x7);
+
+	if (ppc_md.set_dabr)
+		return ppc_md.set_dabr(dabr, dabrx);
+
+	return __set_dabr(dabr, dabrx);
+}
+
+static inline int set_dawr(struct arch_hw_breakpoint *brk)
+{
+	unsigned long dawr, dawrx, mrd;
+
+	dawr = brk->address;
+
+	dawrx  = (brk->type & (HW_BRK_TYPE_READ | HW_BRK_TYPE_WRITE)) \
+		                   << (63 - 58); //* read/write bits */
+	dawrx |= ((brk->type & (HW_BRK_TYPE_TRANSLATE)) >> 2) \
+		                   << (63 - 59); //* translate */
+	dawrx |= (brk->type & (HW_BRK_TYPE_PRIV_ALL)) \
+		                   >> 3; //* PRIM bits */
+	/* dawr length is stored in field MDR bits 48:53.  Matches range in
+	   doublewords (64 bits) baised by -1 eg. 0b000000=1DW and
+	   0b111111=64DW.
+	   brk->len is in bytes.
+	   This aligns up to double word size, shifts and does the bias.
+	*/
+	mrd = ((brk->len + 7) >> 3) - 1;
+	dawrx |= (mrd & 0x3f) << (63 - 53);
+
+	if (ppc_md.set_dawr)
+		return ppc_md.set_dawr(dawr, dawrx);
+	mtspr(SPRN_DAWR, dawr);
+	mtspr(SPRN_DAWRX, dawrx);
+	return 0;
+}
+
+void __set_breakpoint(struct arch_hw_breakpoint *brk)
+{
+	memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk));
+
+	if (cpu_has_feature(CPU_FTR_DAWR))
+		// Power8 or later
+		set_dawr(brk);
+	else if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		// Power7 or earlier
+		set_dabr(brk);
+	else
+		// Shouldn't happen due to higher level checks
+		WARN_ON_ONCE(1);
+}
+
+/* Check if we have DAWR or DABR hardware */
+bool ppc_breakpoint_available(void)
+{
+	if (cpu_has_feature(CPU_FTR_DAWR))
+		return true; /* POWER8 DAWR */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		return false; /* POWER9 with DAWR disabled */
+	/* DABR: Everything but POWER8 and POWER9 */
+	return true;
+}
+EXPORT_SYMBOL_GPL(ppc_breakpoint_available);
+
+static inline bool hw_brk_match(struct arch_hw_breakpoint *a,
+			      struct arch_hw_breakpoint *b)
+{
+	if (a->address != b->address)
+		return false;
+	if (a->type != b->type)
+		return false;
+	if (a->len != b->len)
+		return false;
+	return true;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+
+static inline bool tm_enabled(struct task_struct *tsk)
+{
+	return tsk && tsk->thread.regs && (tsk->thread.regs->msr & MSR_TM);
+}
+
+static void tm_reclaim_thread(struct thread_struct *thr, uint8_t cause)
+{
+	/*
+	 * Use the current MSR TM suspended bit to track if we have
+	 * checkpointed state outstanding.
+	 * On signal delivery, we'd normally reclaim the checkpointed
+	 * state to obtain stack pointer (see:get_tm_stackpointer()).
+	 * This will then directly return to userspace without going
+	 * through __switch_to(). However, if the stack frame is bad,
+	 * we need to exit this thread which calls __switch_to() which
+	 * will again attempt to reclaim the already saved tm state.
+	 * Hence we need to check that we've not already reclaimed
+	 * this state.
+	 * We do this using the current MSR, rather tracking it in
+	 * some specific thread_struct bit, as it has the additional
+	 * benefit of checking for a potential TM bad thing exception.
+	 */
+	if (!MSR_TM_SUSPENDED(mfmsr()))
+		return;
+
+	giveup_all(container_of(thr, struct task_struct, thread));
+
+	tm_reclaim(thr, cause);
+
+	/*
+	 * If we are in a transaction and FP is off then we can't have
+	 * used FP inside that transaction. Hence the checkpointed
+	 * state is the same as the live state. We need to copy the
+	 * live state to the checkpointed state so that when the
+	 * transaction is restored, the checkpointed state is correct
+	 * and the aborted transaction sees the correct state. We use
+	 * ckpt_regs.msr here as that's what tm_reclaim will use to
+	 * determine if it's going to write the checkpointed state or
+	 * not. So either this will write the checkpointed registers,
+	 * or reclaim will. Similarly for VMX.
+	 */
+	if ((thr->ckpt_regs.msr & MSR_FP) == 0)
+		memcpy(&thr->ckfp_state, &thr->fp_state,
+		       sizeof(struct thread_fp_state));
+	if ((thr->ckpt_regs.msr & MSR_VEC) == 0)
+		memcpy(&thr->ckvr_state, &thr->vr_state,
+		       sizeof(struct thread_vr_state));
+}
+
+void tm_reclaim_current(uint8_t cause)
+{
+	tm_enable();
+	tm_reclaim_thread(&current->thread, cause);
+}
+
+static inline void tm_reclaim_task(struct task_struct *tsk)
+{
+	/* We have to work out if we're switching from/to a task that's in the
+	 * middle of a transaction.
+	 *
+	 * In switching we need to maintain a 2nd register state as
+	 * oldtask->thread.ckpt_regs.  We tm_reclaim(oldproc); this saves the
+	 * checkpointed (tbegin) state in ckpt_regs, ckfp_state and
+	 * ckvr_state
+	 *
+	 * We also context switch (save) TFHAR/TEXASR/TFIAR in here.
+	 */
+	struct thread_struct *thr = &tsk->thread;
+
+	if (!thr->regs)
+		return;
+
+	if (!MSR_TM_ACTIVE(thr->regs->msr))
+		goto out_and_saveregs;
+
+	WARN_ON(tm_suspend_disabled);
+
+	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
+		 "ccr=%lx, msr=%lx, trap=%lx)\n",
+		 tsk->pid, thr->regs->nip,
+		 thr->regs->ccr, thr->regs->msr,
+		 thr->regs->trap);
+
+	tm_reclaim_thread(thr, TM_CAUSE_RESCHED);
+
+	TM_DEBUG("--- tm_reclaim on pid %d complete\n",
+		 tsk->pid);
+
+out_and_saveregs:
+	/* Always save the regs here, even if a transaction's not active.
+	 * This context-switches a thread's TM info SPRs.  We do it here to
+	 * be consistent with the restore path (in recheckpoint) which
+	 * cannot happen later in _switch().
+	 */
+	tm_save_sprs(thr);
+}
+
+extern void __tm_recheckpoint(struct thread_struct *thread);
+
+void tm_recheckpoint(struct thread_struct *thread)
+{
+	unsigned long flags;
+
+	if (!(thread->regs->msr & MSR_TM))
+		return;
+
+	/* We really can't be interrupted here as the TEXASR registers can't
+	 * change and later in the trecheckpoint code, we have a userspace R1.
+	 * So let's hard disable over this region.
+	 */
+	local_irq_save(flags);
+	hard_irq_disable();
+
+	/* The TM SPRs are restored here, so that TEXASR.FS can be set
+	 * before the trecheckpoint and no explosion occurs.
+	 */
+	tm_restore_sprs(thread);
+
+	__tm_recheckpoint(thread);
+
+	local_irq_restore(flags);
+}
+
+static inline void tm_recheckpoint_new_task(struct task_struct *new)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return;
+
+	/* Recheckpoint the registers of the thread we're about to switch to.
+	 *
+	 * If the task was using FP, we non-lazily reload both the original and
+	 * the speculative FP register states.  This is because the kernel
+	 * doesn't see if/when a TM rollback occurs, so if we take an FP
+	 * unavailable later, we are unable to determine which set of FP regs
+	 * need to be restored.
+	 */
+	if (!tm_enabled(new))
+		return;
+
+	if (!MSR_TM_ACTIVE(new->thread.regs->msr)){
+		tm_restore_sprs(&new->thread);
+		return;
+	}
+	/* Recheckpoint to restore original checkpointed register state. */
+	TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n",
+		 new->pid, new->thread.regs->msr);
+
+	tm_recheckpoint(&new->thread);
+
+	/*
+	 * The checkpointed state has been restored but the live state has
+	 * not, ensure all the math functionality is turned off to trigger
+	 * restore_math() to reload.
+	 */
+	new->thread.regs->msr &= ~(MSR_FP | MSR_VEC | MSR_VSX);
+
+	TM_DEBUG("*** tm_recheckpoint of pid %d complete "
+		 "(kernel msr 0x%lx)\n",
+		 new->pid, mfmsr());
+}
+
+static inline void __switch_to_tm(struct task_struct *prev,
+		struct task_struct *new)
+{
+	if (cpu_has_feature(CPU_FTR_TM)) {
+		if (tm_enabled(prev) || tm_enabled(new))
+			tm_enable();
+
+		if (tm_enabled(prev)) {
+			prev->thread.load_tm++;
+			tm_reclaim_task(prev);
+			if (!MSR_TM_ACTIVE(prev->thread.regs->msr) && prev->thread.load_tm == 0)
+				prev->thread.regs->msr &= ~MSR_TM;
+		}
+
+		tm_recheckpoint_new_task(new);
+	}
+}
+
+/*
+ * This is called if we are on the way out to userspace and the
+ * TIF_RESTORE_TM flag is set.  It checks if we need to reload
+ * FP and/or vector state and does so if necessary.
+ * If userspace is inside a transaction (whether active or
+ * suspended) and FP/VMX/VSX instructions have ever been enabled
+ * inside that transaction, then we have to keep them enabled
+ * and keep the FP/VMX/VSX state loaded while ever the transaction
+ * continues.  The reason is that if we didn't, and subsequently
+ * got a FP/VMX/VSX unavailable interrupt inside a transaction,
+ * we don't know whether it's the same transaction, and thus we
+ * don't know which of the checkpointed state and the transactional
+ * state to use.
+ */
+void restore_tm_state(struct pt_regs *regs)
+{
+	unsigned long msr_diff;
+
+	/*
+	 * This is the only moment we should clear TIF_RESTORE_TM as
+	 * it is here that ckpt_regs.msr and pt_regs.msr become the same
+	 * again, anything else could lead to an incorrect ckpt_msr being
+	 * saved and therefore incorrect signal contexts.
+	 */
+	clear_thread_flag(TIF_RESTORE_TM);
+	if (!MSR_TM_ACTIVE(regs->msr))
+		return;
+
+	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
+	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
+
+	/* Ensure that restore_math() will restore */
+	if (msr_diff & MSR_FP)
+		current->thread.load_fp = 1;
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) && msr_diff & MSR_VEC)
+		current->thread.load_vec = 1;
+#endif
+	restore_math(regs);
+
+	regs->msr |= msr_diff;
+}
+
+#else
+#define tm_recheckpoint_new_task(new)
+#define __switch_to_tm(prev, new)
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+static inline void save_sprs(struct thread_struct *t)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		t->vrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (cpu_has_feature(CPU_FTR_DSCR))
+		t->dscr = mfspr(SPRN_DSCR);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		t->bescr = mfspr(SPRN_BESCR);
+		t->ebbhr = mfspr(SPRN_EBBHR);
+		t->ebbrr = mfspr(SPRN_EBBRR);
+
+		t->fscr = mfspr(SPRN_FSCR);
+
+		/*
+		 * Note that the TAR is not available for use in the kernel.
+		 * (To provide this, the TAR should be backed up/restored on
+		 * exception entry/exit instead, and be in pt_regs.  FIXME,
+		 * this should be in pt_regs anyway (for debug).)
+		 */
+		t->tar = mfspr(SPRN_TAR);
+	}
+#endif
+
+	thread_pkey_regs_save(t);
+}
+
+static inline void restore_sprs(struct thread_struct *old_thread,
+				struct thread_struct *new_thread)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+	    old_thread->vrsave != new_thread->vrsave)
+		mtspr(SPRN_VRSAVE, new_thread->vrsave);
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (cpu_has_feature(CPU_FTR_DSCR)) {
+		u64 dscr = get_paca()->dscr_default;
+		if (new_thread->dscr_inherit)
+			dscr = new_thread->dscr;
+
+		if (old_thread->dscr != dscr)
+			mtspr(SPRN_DSCR, dscr);
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		if (old_thread->bescr != new_thread->bescr)
+			mtspr(SPRN_BESCR, new_thread->bescr);
+		if (old_thread->ebbhr != new_thread->ebbhr)
+			mtspr(SPRN_EBBHR, new_thread->ebbhr);
+		if (old_thread->ebbrr != new_thread->ebbrr)
+			mtspr(SPRN_EBBRR, new_thread->ebbrr);
+
+		if (old_thread->fscr != new_thread->fscr)
+			mtspr(SPRN_FSCR, new_thread->fscr);
+
+		if (old_thread->tar != new_thread->tar)
+			mtspr(SPRN_TAR, new_thread->tar);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
+	    old_thread->tidr != new_thread->tidr)
+		mtspr(SPRN_TIDR, new_thread->tidr);
+#endif
+
+	thread_pkey_regs_restore(new_thread, old_thread);
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#define CP_SIZE 128
+static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE)));
+#endif
+
+struct task_struct *__switch_to(struct task_struct *prev,
+	struct task_struct *new)
+{
+	struct thread_struct *new_thread, *old_thread;
+	struct task_struct *last;
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct ppc64_tlb_batch *batch;
+#endif
+
+	new_thread = &new->thread;
+	old_thread = &current->thread;
+
+	WARN_ON(!irqs_disabled());
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	batch = this_cpu_ptr(&ppc64_tlb_batch);
+	if (batch->active) {
+		current_thread_info()->local_flags |= _TLF_LAZY_MMU;
+		if (batch->index)
+			__flush_tlb_pending(batch);
+		batch->active = 0;
+	}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	switch_booke_debug_regs(&new->thread.debug);
+#else
+/*
+ * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
+ * schedule DABR
+ */
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
+	if (unlikely(!hw_brk_match(this_cpu_ptr(&current_brk), &new->thread.hw_brk)))
+		__set_breakpoint(&new->thread.hw_brk);
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif
+
+	/*
+	 * We need to save SPRs before treclaim/trecheckpoint as these will
+	 * change a number of them.
+	 */
+	save_sprs(&prev->thread);
+
+	/* Save FPU, Altivec, VSX and SPE state */
+	giveup_all(prev);
+
+	__switch_to_tm(prev, new);
+
+	if (!radix_enabled()) {
+		/*
+		 * We can't take a PMU exception inside _switch() since there
+		 * is a window where the kernel stack SLB and the kernel stack
+		 * are out of sync. Hard disable here.
+		 */
+		hard_irq_disable();
+	}
+
+	/*
+	 * Call restore_sprs() before calling _switch(). If we move it after
+	 * _switch() then we miss out on calling it for new tasks. The reason
+	 * for this is we manually create a stack frame for new tasks that
+	 * directly returns through ret_from_fork() or
+	 * ret_from_kernel_thread(). See copy_thread() for details.
+	 */
+	restore_sprs(old_thread, new_thread);
+
+	last = _switch(old_thread, new_thread);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
+		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
+		batch = this_cpu_ptr(&ppc64_tlb_batch);
+		batch->active = 1;
+	}
+
+	if (current_thread_info()->task->thread.regs) {
+		restore_math(current_thread_info()->task->thread.regs);
+
+		/*
+		 * The copy-paste buffer can only store into foreign real
+		 * addresses, so unprivileged processes can not see the
+		 * data or use it in any way unless they have foreign real
+		 * mappings. If the new process has the foreign real address
+		 * mappings, we must issue a cp_abort to clear any state and
+		 * prevent snooping, corruption or a covert channel.
+		 */
+		if (current_thread_info()->task->thread.used_vas)
+			asm volatile(PPC_CP_ABORT);
+	}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	return last;
+}
+
+static int instructions_to_print = 16;
+
+static void show_instructions(struct pt_regs *regs)
+{
+	int i;
+	unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
+			sizeof(int));
+
+	printk("Instruction dump:");
+
+	for (i = 0; i < instructions_to_print; i++) {
+		int instr;
+
+		if (!(i % 8))
+			pr_cont("\n");
+
+#if !defined(CONFIG_BOOKE)
+		/* If executing with the IMMU off, adjust pc rather
+		 * than print XXXXXXXX.
+		 */
+		if (!(regs->msr & MSR_IR))
+			pc = (unsigned long)phys_to_virt(pc);
+#endif
+
+		if (!__kernel_text_address(pc) ||
+		     probe_kernel_address((unsigned int __user *)pc, instr)) {
+			pr_cont("XXXXXXXX ");
+		} else {
+			if (regs->nip == pc)
+				pr_cont("<%08x> ", instr);
+			else
+				pr_cont("%08x ", instr);
+		}
+
+		pc += sizeof(int);
+	}
+
+	pr_cont("\n");
+}
+
+void show_user_instructions(struct pt_regs *regs)
+{
+	unsigned long pc;
+	int i;
+
+	pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int));
+
+	/*
+	 * Make sure the NIP points at userspace, not kernel text/data or
+	 * elsewhere.
+	 */
+	if (!__access_ok(pc, instructions_to_print * sizeof(int), USER_DS)) {
+		pr_info("%s[%d]: Bad NIP, not dumping instructions.\n",
+			current->comm, current->pid);
+		return;
+	}
+
+	pr_info("%s[%d]: code: ", current->comm, current->pid);
+
+	for (i = 0; i < instructions_to_print; i++) {
+		int instr;
+
+		if (!(i % 8) && (i > 0)) {
+			pr_cont("\n");
+			pr_info("%s[%d]: code: ", current->comm, current->pid);
+		}
+
+		if (probe_kernel_address((unsigned int __user *)pc, instr)) {
+			pr_cont("XXXXXXXX ");
+		} else {
+			if (regs->nip == pc)
+				pr_cont("<%08x> ", instr);
+			else
+				pr_cont("%08x ", instr);
+		}
+
+		pc += sizeof(int);
+	}
+
+	pr_cont("\n");
+}
+
+struct regbit {
+	unsigned long bit;
+	const char *name;
+};
+
+static struct regbit msr_bits[] = {
+#if defined(CONFIG_PPC64) && !defined(CONFIG_BOOKE)
+	{MSR_SF,	"SF"},
+	{MSR_HV,	"HV"},
+#endif
+	{MSR_VEC,	"VEC"},
+	{MSR_VSX,	"VSX"},
+#ifdef CONFIG_BOOKE
+	{MSR_CE,	"CE"},
+#endif
+	{MSR_EE,	"EE"},
+	{MSR_PR,	"PR"},
+	{MSR_FP,	"FP"},
+	{MSR_ME,	"ME"},
+#ifdef CONFIG_BOOKE
+	{MSR_DE,	"DE"},
+#else
+	{MSR_SE,	"SE"},
+	{MSR_BE,	"BE"},
+#endif
+	{MSR_IR,	"IR"},
+	{MSR_DR,	"DR"},
+	{MSR_PMM,	"PMM"},
+#ifndef CONFIG_BOOKE
+	{MSR_RI,	"RI"},
+	{MSR_LE,	"LE"},
+#endif
+	{0,		NULL}
+};
+
+static void print_bits(unsigned long val, struct regbit *bits, const char *sep)
+{
+	const char *s = "";
+
+	for (; bits->bit; ++bits)
+		if (val & bits->bit) {
+			pr_cont("%s%s", s, bits->name);
+			s = sep;
+		}
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static struct regbit msr_tm_bits[] = {
+	{MSR_TS_T,	"T"},
+	{MSR_TS_S,	"S"},
+	{MSR_TM,	"E"},
+	{0,		NULL}
+};
+
+static void print_tm_bits(unsigned long val)
+{
+/*
+ * This only prints something if at least one of the TM bit is set.
+ * Inside the TM[], the output means:
+ *   E: Enabled		(bit 32)
+ *   S: Suspended	(bit 33)
+ *   T: Transactional	(bit 34)
+ */
+	if (val & (MSR_TM | MSR_TS_S | MSR_TS_T)) {
+		pr_cont(",TM[");
+		print_bits(val, msr_tm_bits, "");
+		pr_cont("]");
+	}
+}
+#else
+static void print_tm_bits(unsigned long val) {}
+#endif
+
+static void print_msr_bits(unsigned long val)
+{
+	pr_cont("<");
+	print_bits(val, msr_bits, ",");
+	print_tm_bits(val);
+	pr_cont(">");
+}
+
+#ifdef CONFIG_PPC64
+#define REG		"%016lx"
+#define REGS_PER_LINE	4
+#define LAST_VOLATILE	13
+#else
+#define REG		"%08lx"
+#define REGS_PER_LINE	8
+#define LAST_VOLATILE	12
+#endif
+
+void show_regs(struct pt_regs * regs)
+{
+	int i, trap;
+
+	show_regs_print_info(KERN_DEFAULT);
+
+	printk("NIP:  "REG" LR: "REG" CTR: "REG"\n",
+	       regs->nip, regs->link, regs->ctr);
+	printk("REGS: %px TRAP: %04lx   %s  (%s)\n",
+	       regs, regs->trap, print_tainted(), init_utsname()->release);
+	printk("MSR:  "REG" ", regs->msr);
+	print_msr_bits(regs->msr);
+	pr_cont("  CR: %08lx  XER: %08lx\n", regs->ccr, regs->xer);
+	trap = TRAP(regs);
+	if ((TRAP(regs) != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))
+		pr_cont("CFAR: "REG" ", regs->orig_gpr3);
+	if (trap == 0x200 || trap == 0x300 || trap == 0x600)
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+		pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
+#else
+		pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
+#endif
+#ifdef CONFIG_PPC64
+	pr_cont("IRQMASK: %lx ", regs->softe);
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (MSR_TM_ACTIVE(regs->msr))
+		pr_cont("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
+#endif
+
+	for (i = 0;  i < 32;  i++) {
+		if ((i % REGS_PER_LINE) == 0)
+			pr_cont("\nGPR%02d: ", i);
+		pr_cont(REG " ", regs->gpr[i]);
+		if (i == LAST_VOLATILE && !FULL_REGS(regs))
+			break;
+	}
+	pr_cont("\n");
+#ifdef CONFIG_KALLSYMS
+	/*
+	 * Lookup NIP late so we have the best change of getting the
+	 * above info out without failing
+	 */
+	printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip);
+	printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link);
+#endif
+	show_stack(current, (unsigned long *) regs->gpr[1]);
+	if (!user_mode(regs))
+		show_instructions(regs);
+}
+
+void flush_thread(void)
+{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	flush_ptrace_hw_breakpoint(current);
+#else /* CONFIG_HAVE_HW_BREAKPOINT */
+	set_debug_reg_defaults(&current->thread);
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+}
+
+int set_thread_uses_vas(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -EINVAL;
+
+	current->thread.used_vas = 1;
+
+	/*
+	 * Even a process that has no foreign real address mapping can use
+	 * an unpaired COPY instruction (to no real effect). Issue CP_ABORT
+	 * to clear any pending COPY and prevent a covert channel.
+	 *
+	 * __switch_to() will issue CP_ABORT on future context switches.
+	 */
+	asm volatile(PPC_CP_ABORT);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+	return 0;
+}
+
+#ifdef CONFIG_PPC64
+/**
+ * Assign a TIDR (thread ID) for task @t and set it in the thread
+ * structure. For now, we only support setting TIDR for 'current' task.
+ *
+ * Since the TID value is a truncated form of it PID, it is possible
+ * (but unlikely) for 2 threads to have the same TID. In the unlikely event
+ * that 2 threads share the same TID and are waiting, one of the following
+ * cases will happen:
+ *
+ * 1. The correct thread is running, the wrong thread is not
+ * In this situation, the correct thread is woken and proceeds to pass it's
+ * condition check.
+ *
+ * 2. Neither threads are running
+ * In this situation, neither thread will be woken. When scheduled, the waiting
+ * threads will execute either a wait, which will return immediately, followed
+ * by a condition check, which will pass for the correct thread and fail
+ * for the wrong thread, or they will execute the condition check immediately.
+ *
+ * 3. The wrong thread is running, the correct thread is not
+ * The wrong thread will be woken, but will fail it's condition check and
+ * re-execute wait. The correct thread, when scheduled, will execute either
+ * it's condition check (which will pass), or wait, which returns immediately
+ * when called the first time after the thread is scheduled, followed by it's
+ * condition check (which will pass).
+ *
+ * 4. Both threads are running
+ * Both threads will be woken. The wrong thread will fail it's condition check
+ * and execute another wait, while the correct thread will pass it's condition
+ * check.
+ *
+ * @t: the task to set the thread ID for
+ */
+int set_thread_tidr(struct task_struct *t)
+{
+	if (!cpu_has_feature(CPU_FTR_P9_TIDR))
+		return -EINVAL;
+
+	if (t != current)
+		return -EINVAL;
+
+	if (t->thread.tidr)
+		return 0;
+
+	t->thread.tidr = (u16)task_pid_nr(t);
+	mtspr(SPRN_TIDR, t->thread.tidr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_thread_tidr);
+
+#endif /* CONFIG_PPC64 */
+
+void
+release_thread(struct task_struct *t)
+{
+}
+
+/*
+ * this gets called so that we can store coprocessor state into memory and
+ * copy the current task into the new thread.
+ */
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+	flush_all_to_thread(src);
+	/*
+	 * Flush TM state out so we can copy it.  __switch_to_tm() does this
+	 * flush but it removes the checkpointed state from the current CPU and
+	 * transitions the CPU out of TM mode.  Hence we need to call
+	 * tm_recheckpoint_new_task() (on the same task) to restore the
+	 * checkpointed state back and the TM mode.
+	 *
+	 * Can't pass dst because it isn't ready. Doesn't matter, passing
+	 * dst is only important for __switch_to()
+	 */
+	__switch_to_tm(src, src);
+
+	*dst = *src;
+
+	clear_task_ebb(dst);
+
+	return 0;
+}
+
+static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	unsigned long sp_vsid;
+	unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
+
+	if (radix_enabled())
+		return;
+
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T)
+			<< SLB_VSID_SHIFT_1T;
+	else
+		sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_256M)
+			<< SLB_VSID_SHIFT;
+	sp_vsid |= SLB_VSID_KERNEL | llp;
+	p->thread.ksp_vsid = sp_vsid;
+#endif
+}
+
+/*
+ * Copy a thread..
+ */
+
+/*
+ * Copy architecture-specific thread state
+ */
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long kthread_arg, struct task_struct *p)
+{
+	struct pt_regs *childregs, *kregs;
+	extern void ret_from_fork(void);
+	extern void ret_from_kernel_thread(void);
+	void (*f)(void);
+	unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
+	struct thread_info *ti = task_thread_info(p);
+
+	klp_init_thread_info(ti);
+
+	/* Copy registers */
+	sp -= sizeof(struct pt_regs);
+	childregs = (struct pt_regs *) sp;
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		/* kernel thread */
+		memset(childregs, 0, sizeof(struct pt_regs));
+		childregs->gpr[1] = sp + sizeof(struct pt_regs);
+		/* function */
+		if (usp)
+			childregs->gpr[14] = ppc_function_entry((void *)usp);
+#ifdef CONFIG_PPC64
+		clear_tsk_thread_flag(p, TIF_32BIT);
+		childregs->softe = IRQS_ENABLED;
+#endif
+		childregs->gpr[15] = kthread_arg;
+		p->thread.regs = NULL;	/* no user register state */
+		ti->flags |= _TIF_RESTOREALL;
+		f = ret_from_kernel_thread;
+	} else {
+		/* user thread */
+		struct pt_regs *regs = current_pt_regs();
+		CHECK_FULL_REGS(regs);
+		*childregs = *regs;
+		if (usp)
+			childregs->gpr[1] = usp;
+		p->thread.regs = childregs;
+		childregs->gpr[3] = 0;  /* Result from fork() */
+		if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_PPC64
+			if (!is_32bit_task())
+				childregs->gpr[13] = childregs->gpr[6];
+			else
+#endif
+				childregs->gpr[2] = childregs->gpr[6];
+		}
+
+		f = ret_from_fork;
+	}
+	childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX);
+	sp -= STACK_FRAME_OVERHEAD;
+
+	/*
+	 * The way this works is that at some point in the future
+	 * some task will call _switch to switch to the new task.
+	 * That will pop off the stack frame created below and start
+	 * the new task running at ret_from_fork.  The new task will
+	 * do some house keeping and then return from the fork or clone
+	 * system call, using the stack frame created above.
+	 */
+	((unsigned long *)sp)[0] = 0;
+	sp -= sizeof(struct pt_regs);
+	kregs = (struct pt_regs *) sp;
+	sp -= STACK_FRAME_OVERHEAD;
+	p->thread.ksp = sp;
+#ifdef CONFIG_PPC32
+	p->thread.ksp_limit = (unsigned long)task_stack_page(p) +
+				_ALIGN_UP(sizeof(struct thread_info), 16);
+#endif
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	p->thread.ptrace_bps[0] = NULL;
+#endif
+
+	p->thread.fp_save_area = NULL;
+#ifdef CONFIG_ALTIVEC
+	p->thread.vr_save_area = NULL;
+#endif
+
+	setup_ksp_vsid(p, sp);
+
+#ifdef CONFIG_PPC64 
+	if (cpu_has_feature(CPU_FTR_DSCR)) {
+		p->thread.dscr_inherit = current->thread.dscr_inherit;
+		p->thread.dscr = mfspr(SPRN_DSCR);
+	}
+	if (cpu_has_feature(CPU_FTR_HAS_PPR))
+		p->thread.ppr = INIT_PPR;
+
+	p->thread.tidr = 0;
+#endif
+	kregs->nip = ppc_function_entry(f);
+	return 0;
+}
+
+/*
+ * Set up a thread for executing a new program
+ */
+void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
+{
+#ifdef CONFIG_PPC64
+	unsigned long load_addr = regs->gpr[2];	/* saved by ELF_PLAT_INIT */
+#endif
+
+	/*
+	 * If we exec out of a kernel thread then thread.regs will not be
+	 * set.  Do it now.
+	 */
+	if (!current->thread.regs) {
+		struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE;
+		current->thread.regs = regs - 1;
+	}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * Clear any transactional state, we're exec()ing. The cause is
+	 * not important as there will never be a recheckpoint so it's not
+	 * user visible.
+	 */
+	if (MSR_TM_SUSPENDED(mfmsr()))
+		tm_reclaim_current(0);
+#endif
+
+	memset(regs->gpr, 0, sizeof(regs->gpr));
+	regs->ctr = 0;
+	regs->link = 0;
+	regs->xer = 0;
+	regs->ccr = 0;
+	regs->gpr[1] = sp;
+
+	/*
+	 * We have just cleared all the nonvolatile GPRs, so make
+	 * FULL_REGS(regs) return true.  This is necessary to allow
+	 * ptrace to examine the thread immediately after exec.
+	 */
+	regs->trap &= ~1UL;
+
+#ifdef CONFIG_PPC32
+	regs->mq = 0;
+	regs->nip = start;
+	regs->msr = MSR_USER;
+#else
+	if (!is_32bit_task()) {
+		unsigned long entry;
+
+		if (is_elf2_task()) {
+			/* Look ma, no function descriptors! */
+			entry = start;
+
+			/*
+			 * Ulrich says:
+			 *   The latest iteration of the ABI requires that when
+			 *   calling a function (at its global entry point),
+			 *   the caller must ensure r12 holds the entry point
+			 *   address (so that the function can quickly
+			 *   establish addressability).
+			 */
+			regs->gpr[12] = start;
+			/* Make sure that's restored on entry to userspace. */
+			set_thread_flag(TIF_RESTOREALL);
+		} else {
+			unsigned long toc;
+
+			/* start is a relocated pointer to the function
+			 * descriptor for the elf _start routine.  The first
+			 * entry in the function descriptor is the entry
+			 * address of _start and the second entry is the TOC
+			 * value we need to use.
+			 */
+			__get_user(entry, (unsigned long __user *)start);
+			__get_user(toc, (unsigned long __user *)start+1);
+
+			/* Check whether the e_entry function descriptor entries
+			 * need to be relocated before we can use them.
+			 */
+			if (load_addr != 0) {
+				entry += load_addr;
+				toc   += load_addr;
+			}
+			regs->gpr[2] = toc;
+		}
+		regs->nip = entry;
+		regs->msr = MSR_USER64;
+	} else {
+		regs->nip = start;
+		regs->gpr[2] = 0;
+		regs->msr = MSR_USER32;
+	}
+#endif
+#ifdef CONFIG_VSX
+	current->thread.used_vsr = 0;
+#endif
+	current->thread.load_fp = 0;
+	memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state));
+	current->thread.fp_save_area = NULL;
+#ifdef CONFIG_ALTIVEC
+	memset(&current->thread.vr_state, 0, sizeof(current->thread.vr_state));
+	current->thread.vr_state.vscr.u[3] = 0x00010000; /* Java mode disabled */
+	current->thread.vr_save_area = NULL;
+	current->thread.vrsave = 0;
+	current->thread.used_vr = 0;
+	current->thread.load_vec = 0;
+#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_SPE
+	memset(current->thread.evr, 0, sizeof(current->thread.evr));
+	current->thread.acc = 0;
+	current->thread.spefscr = 0;
+	current->thread.used_spe = 0;
+#endif /* CONFIG_SPE */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	current->thread.tm_tfhar = 0;
+	current->thread.tm_texasr = 0;
+	current->thread.tm_tfiar = 0;
+	current->thread.load_tm = 0;
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+	thread_pkey_regs_init(&current->thread);
+}
+EXPORT_SYMBOL(start_thread);
+
+#define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \
+		| PR_FP_EXC_RES | PR_FP_EXC_INV)
+
+int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
+{
+	struct pt_regs *regs = tsk->thread.regs;
+
+	/* This is a bit hairy.  If we are an SPE enabled  processor
+	 * (have embedded fp) we store the IEEE exception enable flags in
+	 * fpexc_mode.  fpexc_mode is also used for setting FP exception
+	 * mode (asyn, precise, disabled) for 'Classic' FP. */
+	if (val & PR_FP_EXC_SW_ENABLE) {
+#ifdef CONFIG_SPE
+		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
+			tsk->thread.fpexc_mode = val &
+				(PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
+			return 0;
+		} else {
+			return -EINVAL;
+		}
+#else
+		return -EINVAL;
+#endif
+	}
+
+	/* on a CONFIG_SPE this does not hurt us.  The bits that
+	 * __pack_fe01 use do not overlap with bits used for
+	 * PR_FP_EXC_SW_ENABLE.  Additionally, the MSR[FE0,FE1] bits
+	 * on CONFIG_SPE implementations are reserved so writing to
+	 * them does not change anything */
+	if (val > PR_FP_EXC_PRECISE)
+		return -EINVAL;
+	tsk->thread.fpexc_mode = __pack_fe01(val);
+	if (regs != NULL && (regs->msr & MSR_FP) != 0)
+		regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1))
+			| tsk->thread.fpexc_mode;
+	return 0;
+}
+
+int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
+{
+	unsigned int val;
+
+	if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
+#ifdef CONFIG_SPE
+		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
+			val = tsk->thread.fpexc_mode;
+		} else
+			return -EINVAL;
+#else
+		return -EINVAL;
+#endif
+	else
+		val = __unpack_fe01(tsk->thread.fpexc_mode);
+	return put_user(val, (unsigned int __user *) adr);
+}
+
+int set_endian(struct task_struct *tsk, unsigned int val)
+{
+	struct pt_regs *regs = tsk->thread.regs;
+
+	if ((val == PR_ENDIAN_LITTLE && !cpu_has_feature(CPU_FTR_REAL_LE)) ||
+	    (val == PR_ENDIAN_PPC_LITTLE && !cpu_has_feature(CPU_FTR_PPC_LE)))
+		return -EINVAL;
+
+	if (regs == NULL)
+		return -EINVAL;
+
+	if (val == PR_ENDIAN_BIG)
+		regs->msr &= ~MSR_LE;
+	else if (val == PR_ENDIAN_LITTLE || val == PR_ENDIAN_PPC_LITTLE)
+		regs->msr |= MSR_LE;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+int get_endian(struct task_struct *tsk, unsigned long adr)
+{
+	struct pt_regs *regs = tsk->thread.regs;
+	unsigned int val;
+
+	if (!cpu_has_feature(CPU_FTR_PPC_LE) &&
+	    !cpu_has_feature(CPU_FTR_REAL_LE))
+		return -EINVAL;
+
+	if (regs == NULL)
+		return -EINVAL;
+
+	if (regs->msr & MSR_LE) {
+		if (cpu_has_feature(CPU_FTR_REAL_LE))
+			val = PR_ENDIAN_LITTLE;
+		else
+			val = PR_ENDIAN_PPC_LITTLE;
+	} else
+		val = PR_ENDIAN_BIG;
+
+	return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_unalign_ctl(struct task_struct *tsk, unsigned int val)
+{
+	tsk->thread.align_ctl = val;
+	return 0;
+}
+
+int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
+{
+	return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr);
+}
+
+static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
+				  unsigned long nbytes)
+{
+	unsigned long stack_page;
+	unsigned long cpu = task_cpu(p);
+
+	/*
+	 * Avoid crashing if the stack has overflowed and corrupted
+	 * task_cpu(p), which is in the thread_info struct.
+	 */
+	if (cpu < NR_CPUS && cpu_possible(cpu)) {
+		stack_page = (unsigned long) hardirq_ctx[cpu];
+		if (sp >= stack_page + sizeof(struct thread_struct)
+		    && sp <= stack_page + THREAD_SIZE - nbytes)
+			return 1;
+
+		stack_page = (unsigned long) softirq_ctx[cpu];
+		if (sp >= stack_page + sizeof(struct thread_struct)
+		    && sp <= stack_page + THREAD_SIZE - nbytes)
+			return 1;
+	}
+	return 0;
+}
+
+int validate_sp(unsigned long sp, struct task_struct *p,
+		       unsigned long nbytes)
+{
+	unsigned long stack_page = (unsigned long)task_stack_page(p);
+
+	if (sp >= stack_page + sizeof(struct thread_struct)
+	    && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+
+	return valid_irq_stack(sp, p, nbytes);
+}
+
+EXPORT_SYMBOL(validate_sp);
+
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long ip, sp;
+	int count = 0;
+
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
+	sp = p->thread.ksp;
+	if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD))
+		return 0;
+
+	do {
+		sp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+		if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
+		    p->state == TASK_RUNNING)
+			return 0;
+		if (count > 0) {
+			ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]);
+			if (!in_sched_functions(ip))
+				return ip;
+		}
+	} while (count++ < 16);
+	return 0;
+}
+
+static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH;
+
+void show_stack(struct task_struct *tsk, unsigned long *stack)
+{
+	unsigned long sp, ip, lr, newsp;
+	int count = 0;
+	int firstframe = 1;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	int curr_frame = current->curr_ret_stack;
+	extern void return_to_handler(void);
+	unsigned long rth = (unsigned long)return_to_handler;
+#endif
+
+	sp = (unsigned long) stack;
+	if (tsk == NULL)
+		tsk = current;
+	if (sp == 0) {
+		if (tsk == current)
+			sp = current_stack_pointer();
+		else
+			sp = tsk->thread.ksp;
+	}
+
+	lr = 0;
+	printk("Call Trace:\n");
+	do {
+		if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
+			return;
+
+		stack = (unsigned long *) sp;
+		newsp = stack[0];
+		ip = stack[STACK_FRAME_LR_SAVE];
+		if (!firstframe || ip != lr) {
+			printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+			if ((ip == rth) && curr_frame >= 0) {
+				pr_cont(" (%pS)",
+				       (void *)current->ret_stack[curr_frame].ret);
+				curr_frame--;
+			}
+#endif
+			if (firstframe)
+				pr_cont(" (unreliable)");
+			pr_cont("\n");
+		}
+		firstframe = 0;
+
+		/*
+		 * See if this is an exception frame.
+		 * We look for the "regshere" marker in the current frame.
+		 */
+		if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
+		    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+			struct pt_regs *regs = (struct pt_regs *)
+				(sp + STACK_FRAME_OVERHEAD);
+			lr = regs->link;
+			printk("--- interrupt: %lx at %pS\n    LR = %pS\n",
+			       regs->trap, (void *)regs->nip, (void *)lr);
+			firstframe = 1;
+		}
+
+		sp = newsp;
+	} while (count++ < kstack_depth_to_print);
+}
+
+#ifdef CONFIG_PPC64
+/* Called with hard IRQs off */
+void notrace __ppc64_runlatch_on(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		/*
+		 * Least significant bit (RUN) is the only writable bit of
+		 * the CTRL register, so we can avoid mfspr. 2.06 is not the
+		 * earliest ISA where this is the case, but it's convenient.
+		 */
+		mtspr(SPRN_CTRLT, CTRL_RUNLATCH);
+	} else {
+		unsigned long ctrl;
+
+		/*
+		 * Some architectures (e.g., Cell) have writable fields other
+		 * than RUN, so do the read-modify-write.
+		 */
+		ctrl = mfspr(SPRN_CTRLF);
+		ctrl |= CTRL_RUNLATCH;
+		mtspr(SPRN_CTRLT, ctrl);
+	}
+
+	ti->local_flags |= _TLF_RUNLATCH;
+}
+
+/* Called with hard IRQs off */
+void notrace __ppc64_runlatch_off(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	ti->local_flags &= ~_TLF_RUNLATCH;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		mtspr(SPRN_CTRLT, 0);
+	} else {
+		unsigned long ctrl;
+
+		ctrl = mfspr(SPRN_CTRLF);
+		ctrl &= ~CTRL_RUNLATCH;
+		mtspr(SPRN_CTRLT, ctrl);
+	}
+}
+#endif /* CONFIG_PPC64 */
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		sp -= get_random_int() & ~PAGE_MASK;
+	return sp & ~0xf;
+}
+
+static inline unsigned long brk_rnd(void)
+{
+        unsigned long rnd = 0;
+
+	/* 8MB for 32bit, 1GB for 64bit */
+	if (is_32bit_task())
+		rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
+	else
+		rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
+
+	return rnd << PAGE_SHIFT;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long base = mm->brk;
+	unsigned long ret;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * If we are using 1TB segments and we are allowed to randomise
+	 * the heap, we can put it above 1TB so it is backed by a 1TB
+	 * segment. Otherwise the heap will be in the bottom 1TB
+	 * which always uses 256MB segments and this may result in a
+	 * performance penalty. We don't need to worry about radix. For
+	 * radix, mmu_highuser_ssize remains unchanged from 256MB.
+	 */
+	if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T))
+		base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T);
+#endif
+
+	ret = PAGE_ALIGN(base + brk_rnd());
+
+	if (ret < mm->brk)
+		return mm->brk;
+
+	return ret;
+}
+
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
new file mode 100644
index 000000000..f8c49e5d4
--- /dev/null
+++ b/arch/powerpc/kernel/prom.c
@@ -0,0 +1,918 @@
+/*
+ * Procedures for creating, accessing and interpreting the device tree.
+ *
+ * Paul Mackerras	August 1996.
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ * 
+ *  Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ *    {engebret|bergner}@us.ibm.com 
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <stdarg.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+#include <linux/kexec.h>
+#include <linux/irq.h>
+#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/cpu.h>
+
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <asm/kdump.h>
+#include <asm/smp.h>
+#include <asm/mmu.h>
+#include <asm/paca.h>
+#include <asm/pgtable.h>
+#include <asm/powernv.h>
+#include <asm/iommu.h>
+#include <asm/btext.h>
+#include <asm/sections.h>
+#include <asm/machdep.h>
+#include <asm/pci-bridge.h>
+#include <asm/kexec.h>
+#include <asm/opal.h>
+#include <asm/fadump.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/firmware.h>
+#include <asm/dt_cpu_ftrs.h>
+#include <asm/drmem.h>
+
+#include <mm/mmu_decl.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(KERN_ERR fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef CONFIG_PPC64
+int __initdata iommu_is_off;
+int __initdata iommu_force_on;
+unsigned long tce_alloc_start, tce_alloc_end;
+u64 ppc64_rma_size;
+#endif
+static phys_addr_t first_memblock_size;
+static int __initdata boot_cpu_count;
+
+static int __init early_parse_mem(char *p)
+{
+	if (!p)
+		return 1;
+
+	memory_limit = PAGE_ALIGN(memparse(p, &p));
+	DBG("memory limit = 0x%llx\n", memory_limit);
+
+	return 0;
+}
+early_param("mem", early_parse_mem);
+
+/*
+ * overlaps_initrd - check for overlap with page aligned extension of
+ * initrd.
+ */
+static inline int overlaps_initrd(unsigned long start, unsigned long size)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (!initrd_start)
+		return 0;
+
+	return	(start + size) > _ALIGN_DOWN(initrd_start, PAGE_SIZE) &&
+			start <= _ALIGN_UP(initrd_end, PAGE_SIZE);
+#else
+	return 0;
+#endif
+}
+
+/**
+ * move_device_tree - move tree to an unused area, if needed.
+ *
+ * The device tree may be allocated beyond our memory limit, or inside the
+ * crash kernel region for kdump, or within the page aligned range of initrd.
+ * If so, move it out of the way.
+ */
+static void __init move_device_tree(void)
+{
+	unsigned long start, size;
+	void *p;
+
+	DBG("-> move_device_tree\n");
+
+	start = __pa(initial_boot_params);
+	size = fdt_totalsize(initial_boot_params);
+
+	if ((memory_limit && (start + size) > PHYSICAL_START + memory_limit) ||
+			overlaps_crashkernel(start, size) ||
+			overlaps_initrd(start, size)) {
+		p = __va(memblock_alloc(size, PAGE_SIZE));
+		memcpy(p, initial_boot_params, size);
+		initial_boot_params = p;
+		DBG("Moved device tree to 0x%px\n", p);
+	}
+
+	DBG("<- move_device_tree\n");
+}
+
+/*
+ * ibm,pa-features is a per-cpu property that contains a string of
+ * attribute descriptors, each of which has a 2 byte header plus up
+ * to 254 bytes worth of processor attribute bits.  First header
+ * byte specifies the number of bytes following the header.
+ * Second header byte is an "attribute-specifier" type, of which
+ * zero is the only currently-defined value.
+ * Implementation:  Pass in the byte and bit offset for the feature
+ * that we are interested in.  The function will return -1 if the
+ * pa-features property is missing, or a 1/0 to indicate if the feature
+ * is supported/not supported.  Note that the bit numbers are
+ * big-endian to match the definition in PAPR.
+ */
+static struct ibm_pa_feature {
+	unsigned long	cpu_features;	/* CPU_FTR_xxx bit */
+	unsigned long	mmu_features;	/* MMU_FTR_xxx bit */
+	unsigned int	cpu_user_ftrs;	/* PPC_FEATURE_xxx bit */
+	unsigned int	cpu_user_ftrs2;	/* PPC_FEATURE2_xxx bit */
+	unsigned char	pabyte;		/* byte number in ibm,pa-features */
+	unsigned char	pabit;		/* bit number (big-endian) */
+	unsigned char	invert;		/* if 1, pa bit set => clear feature */
+} ibm_pa_features[] __initdata = {
+	{ .pabyte = 0,  .pabit = 0, .cpu_user_ftrs = PPC_FEATURE_HAS_MMU },
+	{ .pabyte = 0,  .pabit = 1, .cpu_user_ftrs = PPC_FEATURE_HAS_FPU },
+	{ .pabyte = 0,  .pabit = 3, .cpu_features  = CPU_FTR_CTRL },
+	{ .pabyte = 0,  .pabit = 6, .cpu_features  = CPU_FTR_NOEXECUTE },
+	{ .pabyte = 1,  .pabit = 2, .mmu_features  = MMU_FTR_CI_LARGE_PAGE },
+#ifdef CONFIG_PPC_RADIX_MMU
+	{ .pabyte = 40, .pabit = 0, .mmu_features  = MMU_FTR_TYPE_RADIX },
+#endif
+	{ .pabyte = 1,  .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN },
+	{ .pabyte = 5,  .pabit = 0, .cpu_features  = CPU_FTR_REAL_LE,
+				    .cpu_user_ftrs = PPC_FEATURE_TRUE_LE },
+	/*
+	 * If the kernel doesn't support TM (ie CONFIG_PPC_TRANSACTIONAL_MEM=n),
+	 * we don't want to turn on TM here, so we use the *_COMP versions
+	 * which are 0 if the kernel doesn't support TM.
+	 */
+	{ .pabyte = 22, .pabit = 0, .cpu_features = CPU_FTR_TM_COMP,
+	  .cpu_user_ftrs2 = PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_HTM_NOSC_COMP },
+};
+
+static void __init scan_features(unsigned long node, const unsigned char *ftrs,
+				 unsigned long tablelen,
+				 struct ibm_pa_feature *fp,
+				 unsigned long ft_size)
+{
+	unsigned long i, len, bit;
+
+	/* find descriptor with type == 0 */
+	for (;;) {
+		if (tablelen < 3)
+			return;
+		len = 2 + ftrs[0];
+		if (tablelen < len)
+			return;		/* descriptor 0 not found */
+		if (ftrs[1] == 0)
+			break;
+		tablelen -= len;
+		ftrs += len;
+	}
+
+	/* loop over bits we know about */
+	for (i = 0; i < ft_size; ++i, ++fp) {
+		if (fp->pabyte >= ftrs[0])
+			continue;
+		bit = (ftrs[2 + fp->pabyte] >> (7 - fp->pabit)) & 1;
+		if (bit ^ fp->invert) {
+			cur_cpu_spec->cpu_features |= fp->cpu_features;
+			cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftrs;
+			cur_cpu_spec->cpu_user_features2 |= fp->cpu_user_ftrs2;
+			cur_cpu_spec->mmu_features |= fp->mmu_features;
+		} else {
+			cur_cpu_spec->cpu_features &= ~fp->cpu_features;
+			cur_cpu_spec->cpu_user_features &= ~fp->cpu_user_ftrs;
+			cur_cpu_spec->cpu_user_features2 &= ~fp->cpu_user_ftrs2;
+			cur_cpu_spec->mmu_features &= ~fp->mmu_features;
+		}
+	}
+}
+
+static void __init check_cpu_pa_features(unsigned long node)
+{
+	const unsigned char *pa_ftrs;
+	int tablelen;
+
+	pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen);
+	if (pa_ftrs == NULL)
+		return;
+
+	scan_features(node, pa_ftrs, tablelen,
+		      ibm_pa_features, ARRAY_SIZE(ibm_pa_features));
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static void __init init_mmu_slb_size(unsigned long node)
+{
+	const __be32 *slb_size_ptr;
+
+	slb_size_ptr = of_get_flat_dt_prop(node, "slb-size", NULL) ? :
+			of_get_flat_dt_prop(node, "ibm,slb-size", NULL);
+
+	if (slb_size_ptr)
+		mmu_slb_size = be32_to_cpup(slb_size_ptr);
+}
+#else
+#define init_mmu_slb_size(node) do { } while(0)
+#endif
+
+static struct feature_property {
+	const char *name;
+	u32 min_value;
+	unsigned long cpu_feature;
+	unsigned long cpu_user_ftr;
+} feature_properties[] __initdata = {
+#ifdef CONFIG_ALTIVEC
+	{"altivec", 0, CPU_FTR_ALTIVEC, PPC_FEATURE_HAS_ALTIVEC},
+	{"ibm,vmx", 1, CPU_FTR_ALTIVEC, PPC_FEATURE_HAS_ALTIVEC},
+#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_VSX
+	/* Yes, this _really_ is ibm,vmx == 2 to enable VSX */
+	{"ibm,vmx", 2, CPU_FTR_VSX, PPC_FEATURE_HAS_VSX},
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_PPC64
+	{"ibm,dfp", 1, 0, PPC_FEATURE_HAS_DFP},
+	{"ibm,purr", 1, CPU_FTR_PURR, 0},
+	{"ibm,spurr", 1, CPU_FTR_SPURR, 0},
+#endif /* CONFIG_PPC64 */
+};
+
+#if defined(CONFIG_44x) && defined(CONFIG_PPC_FPU)
+static __init void identical_pvr_fixup(unsigned long node)
+{
+	unsigned int pvr;
+	const char *model = of_get_flat_dt_prop(node, "model", NULL);
+
+	/*
+	 * Since 440GR(x)/440EP(x) processors have the same pvr,
+	 * we check the node path and set bit 28 in the cur_cpu_spec
+	 * pvr for EP(x) processor version. This bit is always 0 in
+	 * the "real" pvr. Then we call identify_cpu again with
+	 * the new logical pvr to enable FPU support.
+	 */
+	if (model && strstr(model, "440EP")) {
+		pvr = cur_cpu_spec->pvr_value | 0x8;
+		identify_cpu(0, pvr);
+		DBG("Using logical pvr %x for %s\n", pvr, model);
+	}
+}
+#else
+#define identical_pvr_fixup(node) do { } while(0)
+#endif
+
+static void __init check_cpu_feature_properties(unsigned long node)
+{
+	int i;
+	struct feature_property *fp = feature_properties;
+	const __be32 *prop;
+
+	for (i = 0; i < (int)ARRAY_SIZE(feature_properties); ++i, ++fp) {
+		prop = of_get_flat_dt_prop(node, fp->name, NULL);
+		if (prop && be32_to_cpup(prop) >= fp->min_value) {
+			cur_cpu_spec->cpu_features |= fp->cpu_feature;
+			cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftr;
+		}
+	}
+}
+
+static int __init early_init_dt_scan_cpus(unsigned long node,
+					  const char *uname, int depth,
+					  void *data)
+{
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be32 *prop;
+	const __be32 *intserv;
+	int i, nthreads;
+	int len;
+	int found = -1;
+	int found_thread = 0;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	/* Get physical cpuid */
+	intserv = of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &len);
+	if (!intserv)
+		intserv = of_get_flat_dt_prop(node, "reg", &len);
+
+	nthreads = len / sizeof(int);
+
+	/*
+	 * Now see if any of these threads match our boot cpu.
+	 * NOTE: This must match the parsing done in smp_setup_cpu_maps.
+	 */
+	for (i = 0; i < nthreads; i++) {
+		if (be32_to_cpu(intserv[i]) ==
+			fdt_boot_cpuid_phys(initial_boot_params)) {
+			found = boot_cpu_count;
+			found_thread = i;
+		}
+#ifdef CONFIG_SMP
+		/* logical cpu id is always 0 on UP kernels */
+		boot_cpu_count++;
+#endif
+	}
+
+	/* Not the boot CPU */
+	if (found < 0)
+		return 0;
+
+	DBG("boot cpu: logical %d physical %d\n", found,
+	    be32_to_cpu(intserv[found_thread]));
+	boot_cpuid = found;
+
+	/*
+	 * PAPR defines "logical" PVR values for cpus that
+	 * meet various levels of the architecture:
+	 * 0x0f000001	Architecture version 2.04
+	 * 0x0f000002	Architecture version 2.05
+	 * If the cpu-version property in the cpu node contains
+	 * such a value, we call identify_cpu again with the
+	 * logical PVR value in order to use the cpu feature
+	 * bits appropriate for the architecture level.
+	 *
+	 * A POWER6 partition in "POWER6 architected" mode
+	 * uses the 0x0f000002 PVR value; in POWER5+ mode
+	 * it uses 0x0f000001.
+	 *
+	 * If we're using device tree CPU feature discovery then we don't
+	 * support the cpu-version property, and it's the responsibility of the
+	 * firmware/hypervisor to provide the correct feature set for the
+	 * architecture level via the ibm,powerpc-cpu-features binding.
+	 */
+	if (!dt_cpu_ftrs_in_use()) {
+		prop = of_get_flat_dt_prop(node, "cpu-version", NULL);
+		if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000)
+			identify_cpu(0, be32_to_cpup(prop));
+
+		check_cpu_feature_properties(node);
+		check_cpu_pa_features(node);
+	}
+
+	identical_pvr_fixup(node);
+	init_mmu_slb_size(node);
+
+#ifdef CONFIG_PPC64
+	if (nthreads == 1)
+		cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
+	else if (!dt_cpu_ftrs_in_use())
+		cur_cpu_spec->cpu_features |= CPU_FTR_SMT;
+	allocate_paca(boot_cpuid);
+#endif
+	set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread]));
+
+	return 0;
+}
+
+static int __init early_init_dt_scan_chosen_ppc(unsigned long node,
+						const char *uname,
+						int depth, void *data)
+{
+	const unsigned long *lprop; /* All these set by kernel, so no need to convert endian */
+
+	/* Use common scan routine to determine if this is the chosen node */
+	if (early_init_dt_scan_chosen(node, uname, depth, data) == 0)
+		return 0;
+
+#ifdef CONFIG_PPC64
+	/* check if iommu is forced on or off */
+	if (of_get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL)
+		iommu_is_off = 1;
+	if (of_get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
+		iommu_force_on = 1;
+#endif
+
+	/* mem=x on the command line is the preferred mechanism */
+	lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL);
+	if (lprop)
+		memory_limit = *lprop;
+
+#ifdef CONFIG_PPC64
+	lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
+	if (lprop)
+		tce_alloc_start = *lprop;
+	lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
+	if (lprop)
+		tce_alloc_end = *lprop;
+#endif
+
+#ifdef CONFIG_KEXEC_CORE
+	lprop = of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL);
+	if (lprop)
+		crashk_res.start = *lprop;
+
+	lprop = of_get_flat_dt_prop(node, "linux,crashkernel-size", NULL);
+	if (lprop)
+		crashk_res.end = crashk_res.start + *lprop - 1;
+#endif
+
+	/* break now */
+	return 1;
+}
+
+/*
+ * Compare the range against max mem limit and update
+ * size if it cross the limit.
+ */
+
+#ifdef CONFIG_SPARSEMEM
+static bool validate_mem_limit(u64 base, u64 *size)
+{
+	u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
+
+	if (base >= max_mem)
+		return false;
+	if ((base + *size) > max_mem)
+		*size = max_mem - base;
+	return true;
+}
+#else
+static bool validate_mem_limit(u64 base, u64 *size)
+{
+	return true;
+}
+#endif
+
+#ifdef CONFIG_PPC_PSERIES
+/*
+ * Interpret the ibm dynamic reconfiguration memory LMBs.
+ * This contains a list of memory blocks along with NUMA affinity
+ * information.
+ */
+static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
+					const __be32 **usm)
+{
+	u64 base, size;
+	int is_kexec_kdump = 0, rngs;
+
+	base = lmb->base_addr;
+	size = drmem_lmb_size();
+	rngs = 1;
+
+	/*
+	 * Skip this block if the reserved bit is set in flags
+	 * or if the block is not assigned to this partition.
+	 */
+	if ((lmb->flags & DRCONF_MEM_RESERVED) ||
+	    !(lmb->flags & DRCONF_MEM_ASSIGNED))
+		return;
+
+	if (*usm)
+		is_kexec_kdump = 1;
+
+	if (is_kexec_kdump) {
+		/*
+		 * For each memblock in ibm,dynamic-memory, a
+		 * corresponding entry in linux,drconf-usable-memory
+		 * property contains a counter 'p' followed by 'p'
+		 * (base, size) duple. Now read the counter from
+		 * linux,drconf-usable-memory property
+		 */
+		rngs = dt_mem_next_cell(dt_root_size_cells, usm);
+		if (!rngs) /* there are no (base, size) duple */
+			return;
+	}
+
+	do {
+		if (is_kexec_kdump) {
+			base = dt_mem_next_cell(dt_root_addr_cells, usm);
+			size = dt_mem_next_cell(dt_root_size_cells, usm);
+		}
+
+		if (iommu_is_off) {
+			if (base >= 0x80000000ul)
+				continue;
+			if ((base + size) > 0x80000000ul)
+				size = 0x80000000ul - base;
+		}
+
+		DBG("Adding: %llx -> %llx\n", base, size);
+		if (validate_mem_limit(base, &size))
+			memblock_add(base, size);
+	} while (--rngs);
+}
+#endif /* CONFIG_PPC_PSERIES */
+
+static int __init early_init_dt_scan_memory_ppc(unsigned long node,
+						const char *uname,
+						int depth, void *data)
+{
+#ifdef CONFIG_PPC_PSERIES
+	if (depth == 1 &&
+	    strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+		walk_drmem_lmbs_early(node, early_init_drmem_lmb);
+		return 0;
+	}
+#endif
+	
+	return early_init_dt_scan_memory(node, uname, depth, data);
+}
+
+/*
+ * For a relocatable kernel, we need to get the memstart_addr first,
+ * then use it to calculate the virtual kernel start address. This has
+ * to happen at a very early stage (before machine_init). In this case,
+ * we just want to get the memstart_address and would not like to mess the
+ * memblock at this stage. So introduce a variable to skip the memblock_add()
+ * for this reason.
+ */
+#ifdef CONFIG_RELOCATABLE
+static int add_mem_to_memblock = 1;
+#else
+#define add_mem_to_memblock 1
+#endif
+
+void __init early_init_dt_add_memory_arch(u64 base, u64 size)
+{
+#ifdef CONFIG_PPC64
+	if (iommu_is_off) {
+		if (base >= 0x80000000ul)
+			return;
+		if ((base + size) > 0x80000000ul)
+			size = 0x80000000ul - base;
+	}
+#endif
+	/* Keep track of the beginning of memory -and- the size of
+	 * the very first block in the device-tree as it represents
+	 * the RMA on ppc64 server
+	 */
+	if (base < memstart_addr) {
+		memstart_addr = base;
+		first_memblock_size = size;
+	}
+
+	/* Add the chunk to the MEMBLOCK list */
+	if (add_mem_to_memblock) {
+		if (validate_mem_limit(base, &size))
+			memblock_add(base, size);
+	}
+}
+
+static void __init early_reserve_mem_dt(void)
+{
+	unsigned long i, dt_root;
+	int len;
+	const __be32 *prop;
+
+	early_init_fdt_reserve_self();
+	early_init_fdt_scan_reserved_mem();
+
+	dt_root = of_get_flat_dt_root();
+
+	prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
+
+	if (!prop)
+		return;
+
+	DBG("Found new-style reserved-ranges\n");
+
+	/* Each reserved range is an (address,size) pair, 2 cells each,
+	 * totalling 4 cells per range. */
+	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+		u64 base, size;
+
+		base = of_read_number(prop + (i * 4) + 0, 2);
+		size = of_read_number(prop + (i * 4) + 2, 2);
+
+		if (size) {
+			DBG("reserving: %llx -> %llx\n", base, size);
+			memblock_reserve(base, size);
+		}
+	}
+}
+
+static void __init early_reserve_mem(void)
+{
+	__be64 *reserve_map;
+
+	reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
+			fdt_off_mem_rsvmap(initial_boot_params));
+
+	/* Look for the new "reserved-regions" property in the DT */
+	early_reserve_mem_dt();
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Then reserve the initrd, if any */
+	if (initrd_start && (initrd_end > initrd_start)) {
+		memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
+			_ALIGN_UP(initrd_end, PAGE_SIZE) -
+			_ALIGN_DOWN(initrd_start, PAGE_SIZE));
+	}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+#ifdef CONFIG_PPC32
+	/* 
+	 * Handle the case where we might be booting from an old kexec
+	 * image that setup the mem_rsvmap as pairs of 32-bit values
+	 */
+	if (be64_to_cpup(reserve_map) > 0xffffffffull) {
+		u32 base_32, size_32;
+		__be32 *reserve_map_32 = (__be32 *)reserve_map;
+
+		DBG("Found old 32-bit reserve map\n");
+
+		while (1) {
+			base_32 = be32_to_cpup(reserve_map_32++);
+			size_32 = be32_to_cpup(reserve_map_32++);
+			if (size_32 == 0)
+				break;
+			DBG("reserving: %x -> %x\n", base_32, size_32);
+			memblock_reserve(base_32, size_32);
+		}
+		return;
+	}
+#endif
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static bool tm_disabled __initdata;
+
+static int __init parse_ppc_tm(char *str)
+{
+	bool res;
+
+	if (kstrtobool(str, &res))
+		return -EINVAL;
+
+	tm_disabled = !res;
+
+	return 0;
+}
+early_param("ppc_tm", parse_ppc_tm);
+
+static void __init tm_init(void)
+{
+	if (tm_disabled) {
+		pr_info("Disabling hardware transactional memory (HTM)\n");
+		cur_cpu_spec->cpu_user_features2 &=
+			~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
+		cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+		return;
+	}
+
+	pnv_tm_init();
+}
+#else
+static void tm_init(void) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+#ifdef CONFIG_PPC64
+static void __init save_fscr_to_task(void)
+{
+	/*
+	 * Ensure the init_task (pid 0, aka swapper) uses the value of FSCR we
+	 * have configured via the device tree features or via __init_FSCR().
+	 * That value will then be propagated to pid 1 (init) and all future
+	 * processes.
+	 */
+	if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+		init_task.thread.fscr = mfspr(SPRN_FSCR);
+}
+#else
+static inline void save_fscr_to_task(void) {};
+#endif
+
+
+void __init early_init_devtree(void *params)
+{
+	phys_addr_t limit;
+
+	DBG(" -> early_init_devtree(%px)\n", params);
+
+	/* Too early to BUG_ON(), do it by hand */
+	if (!early_init_dt_verify(params))
+		panic("BUG: Failed verifying flat device tree, bad version?");
+
+#ifdef CONFIG_PPC_RTAS
+	/* Some machines might need RTAS info for debugging, grab it now. */
+	of_scan_flat_dt(early_init_dt_scan_rtas, NULL);
+#endif
+
+#ifdef CONFIG_PPC_POWERNV
+	/* Some machines might need OPAL info for debugging, grab it now. */
+	of_scan_flat_dt(early_init_dt_scan_opal, NULL);
+#endif
+
+#ifdef CONFIG_FA_DUMP
+	/* scan tree to see if dump is active during last boot */
+	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
+#endif
+
+	/* Retrieve various informations from the /chosen node of the
+	 * device-tree, including the platform type, initrd location and
+	 * size, TCE reserve, and more ...
+	 */
+	of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
+
+	/* Scan memory nodes and rebuild MEMBLOCKs */
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+	of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
+
+	parse_early_param();
+
+	/* make sure we've parsed cmdline for mem= before this */
+	if (memory_limit)
+		first_memblock_size = min_t(u64, first_memblock_size, memory_limit);
+	setup_initial_memory_limit(memstart_addr, first_memblock_size);
+	/* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */
+	memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START);
+	/* If relocatable, reserve first 32k for interrupt vectors etc. */
+	if (PHYSICAL_START > MEMORY_START)
+		memblock_reserve(MEMORY_START, 0x8000);
+	reserve_kdump_trampoline();
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * If we fail to reserve memory for firmware-assisted dump then
+	 * fallback to kexec based kdump.
+	 */
+	if (fadump_reserve_mem() == 0)
+#endif
+		reserve_crashkernel();
+	early_reserve_mem();
+
+	/* Ensure that total memory size is page-aligned. */
+	limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
+	memblock_enforce_memory_limit(limit);
+
+	memblock_allow_resize();
+	memblock_dump_all();
+
+	DBG("Phys. mem: %llx\n", (unsigned long long)memblock_phys_mem_size());
+
+	/* We may need to relocate the flat tree, do it now.
+	 * FIXME .. and the initrd too? */
+	move_device_tree();
+
+	allocate_paca_ptrs();
+
+	DBG("Scanning CPUs ...\n");
+
+	dt_cpu_ftrs_scan();
+
+	/* Retrieve CPU related informations from the flat tree
+	 * (altivec support, boot CPU ID, ...)
+	 */
+	of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
+	if (boot_cpuid < 0) {
+		printk("Failed to identify boot CPU !\n");
+		BUG();
+	}
+
+	save_fscr_to_task();
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
+	/* We'll later wait for secondaries to check in; there are
+	 * NCPUS-1 non-boot CPUs  :-)
+	 */
+	spinning_secondaries = boot_cpu_count - 1;
+#endif
+
+	mmu_early_init_devtree();
+
+#ifdef CONFIG_PPC_POWERNV
+	/* Scan and build the list of machine check recoverable ranges */
+	of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
+#endif
+	epapr_paravirt_early_init();
+
+	/* Now try to figure out if we are running on LPAR and so on */
+	pseries_probe_fw_features();
+
+#ifdef CONFIG_PPC_PS3
+	/* Identify PS3 firmware */
+	if (of_flat_dt_is_compatible(of_get_flat_dt_root(), "sony,ps3"))
+		powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE;
+#endif
+
+	tm_init();
+
+	DBG(" <- early_init_devtree()\n");
+}
+
+#ifdef CONFIG_RELOCATABLE
+/*
+ * This function run before early_init_devtree, so we have to init
+ * initial_boot_params.
+ */
+void __init early_get_first_memblock_info(void *params, phys_addr_t *size)
+{
+	/* Setup flat device-tree pointer */
+	initial_boot_params = params;
+
+	/*
+	 * Scan the memory nodes and set add_mem_to_memblock to 0 to avoid
+	 * mess the memblock.
+	 */
+	add_mem_to_memblock = 0;
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+	of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
+	add_mem_to_memblock = 1;
+
+	if (size)
+		*size = first_memblock_size;
+}
+#endif
+
+/*******
+ *
+ * New implementation of the OF "find" APIs, return a refcounted
+ * object, call of_node_put() when done.  The device tree and list
+ * are protected by a rw_lock.
+ *
+ * Note that property management will need some locking as well,
+ * this isn't dealt with yet.
+ *
+ *******/
+
+/**
+ * of_get_ibm_chip_id - Returns the IBM "chip-id" of a device
+ * @np: device node of the device
+ *
+ * This looks for a property "ibm,chip-id" in the node or any
+ * of its parents and returns its content, or -1 if it cannot
+ * be found.
+ */
+int of_get_ibm_chip_id(struct device_node *np)
+{
+	of_node_get(np);
+	while (np) {
+		u32 chip_id;
+
+		/*
+		 * Skiboot may produce memory nodes that contain more than one
+		 * cell in chip-id, we only read the first one here.
+		 */
+		if (!of_property_read_u32(np, "ibm,chip-id", &chip_id)) {
+			of_node_put(np);
+			return chip_id;
+		}
+
+		np = of_get_next_parent(np);
+	}
+	return -1;
+}
+EXPORT_SYMBOL(of_get_ibm_chip_id);
+
+/**
+ * cpu_to_chip_id - Return the cpus chip-id
+ * @cpu: The logical cpu number.
+ *
+ * Return the value of the ibm,chip-id property corresponding to the given
+ * logical cpu number. If the chip-id can not be found, returns -1.
+ */
+int cpu_to_chip_id(int cpu)
+{
+	struct device_node *np;
+
+	np = of_get_cpu_node(cpu, NULL);
+	if (!np)
+		return -1;
+
+	of_node_put(np);
+	return of_get_ibm_chip_id(np);
+}
+EXPORT_SYMBOL(cpu_to_chip_id);
+
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Early firmware scanning must use this rather than
+	 * get_hard_smp_processor_id because we don't have pacas allocated
+	 * until memory topology is discovered.
+	 */
+	if (cpu_to_phys_id != NULL)
+		return (int)phys_id == cpu_to_phys_id[cpu];
+#endif
+
+	return (int)phys_id == get_hard_smp_processor_id(cpu);
+}
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
new file mode 100644
index 000000000..29a8087a4
--- /dev/null
+++ b/arch/powerpc/kernel/prom_init.c
@@ -0,0 +1,3327 @@
+/*
+ * Procedures for interfacing to Open Firmware.
+ *
+ * Paul Mackerras	August 1996.
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ * 
+ *  Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ *    {engebret|bergner}@us.ibm.com 
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_PROM
+
+/* we cannot use FORTIFY as it brings in new symbols */
+#define __NO_FORTIFY
+
+#include <stdarg.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/bitops.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/iommu.h>
+#include <asm/btext.h>
+#include <asm/sections.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/asm-prototypes.h>
+
+#include <linux/linux_logo.h>
+
+/*
+ * Eventually bump that one up
+ */
+#define DEVTREE_CHUNK_SIZE	0x100000
+
+/*
+ * This is the size of the local memory reserve map that gets copied
+ * into the boot params passed to the kernel. That size is totally
+ * flexible as the kernel just reads the list until it encounters an
+ * entry with size 0, so it can be changed without breaking binary
+ * compatibility
+ */
+#define MEM_RESERVE_MAP_SIZE	8
+
+/*
+ * prom_init() is called very early on, before the kernel text
+ * and data have been mapped to KERNELBASE.  At this point the code
+ * is running at whatever address it has been loaded at.
+ * On ppc32 we compile with -mrelocatable, which means that references
+ * to extern and static variables get relocated automatically.
+ * ppc64 objects are always relocatable, we just need to relocate the
+ * TOC.
+ *
+ * Because OF may have mapped I/O devices into the area starting at
+ * KERNELBASE, particularly on CHRP machines, we can't safely call
+ * OF once the kernel has been mapped to KERNELBASE.  Therefore all
+ * OF calls must be done within prom_init().
+ *
+ * ADDR is used in calls to call_prom.  The 4th and following
+ * arguments to call_prom should be 32-bit values.
+ * On ppc64, 64 bit values are truncated to 32 bits (and
+ * fortunately don't get interpreted as two arguments).
+ */
+#define ADDR(x)		(u32)(unsigned long)(x)
+
+#ifdef CONFIG_PPC64
+#define OF_WORKAROUNDS	0
+#else
+#define OF_WORKAROUNDS	of_workarounds
+int of_workarounds;
+#endif
+
+#define OF_WA_CLAIM	1	/* do phys/virt claim separately, then map */
+#define OF_WA_LONGTRAIL	2	/* work around longtrail bugs */
+
+#define PROM_BUG() do {						\
+        prom_printf("kernel BUG at %s line 0x%x!\n",		\
+		    __FILE__, __LINE__);			\
+        __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR);	\
+} while (0)
+
+#ifdef DEBUG_PROM
+#define prom_debug(x...)	prom_printf(x)
+#else
+#define prom_debug(x...)	do { } while (0)
+#endif
+
+
+typedef u32 prom_arg_t;
+
+struct prom_args {
+        __be32 service;
+        __be32 nargs;
+        __be32 nret;
+        __be32 args[10];
+};
+
+struct prom_t {
+	ihandle root;
+	phandle chosen;
+	int cpu;
+	ihandle stdout;
+	ihandle mmumap;
+	ihandle memory;
+};
+
+struct mem_map_entry {
+	__be64	base;
+	__be64	size;
+};
+
+typedef __be32 cell_t;
+
+extern void __start(unsigned long r3, unsigned long r4, unsigned long r5,
+		    unsigned long r6, unsigned long r7, unsigned long r8,
+		    unsigned long r9);
+
+#ifdef CONFIG_PPC64
+extern int enter_prom(struct prom_args *args, unsigned long entry);
+#else
+static inline int enter_prom(struct prom_args *args, unsigned long entry)
+{
+	return ((int (*)(struct prom_args *))entry)(args);
+}
+#endif
+
+extern void copy_and_flush(unsigned long dest, unsigned long src,
+			   unsigned long size, unsigned long offset);
+
+/* prom structure */
+static struct prom_t __initdata prom;
+
+static unsigned long prom_entry __initdata;
+
+#define PROM_SCRATCH_SIZE 256
+
+static char __initdata of_stdout_device[256];
+static char __initdata prom_scratch[PROM_SCRATCH_SIZE];
+
+static unsigned long __initdata dt_header_start;
+static unsigned long __initdata dt_struct_start, dt_struct_end;
+static unsigned long __initdata dt_string_start, dt_string_end;
+
+static unsigned long __initdata prom_initrd_start, prom_initrd_end;
+
+#ifdef CONFIG_PPC64
+static int __initdata prom_iommu_force_on;
+static int __initdata prom_iommu_off;
+static unsigned long __initdata prom_tce_alloc_start;
+static unsigned long __initdata prom_tce_alloc_end;
+#endif
+
+static bool prom_radix_disable __initdata = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
+
+struct platform_support {
+	bool hash_mmu;
+	bool radix_mmu;
+	bool radix_gtse;
+	bool xive;
+};
+
+/* Platforms codes are now obsolete in the kernel. Now only used within this
+ * file and ultimately gone too. Feel free to change them if you need, they
+ * are not shared with anything outside of this file anymore
+ */
+#define PLATFORM_PSERIES	0x0100
+#define PLATFORM_PSERIES_LPAR	0x0101
+#define PLATFORM_LPAR		0x0001
+#define PLATFORM_POWERMAC	0x0400
+#define PLATFORM_GENERIC	0x0500
+#define PLATFORM_OPAL		0x0600
+
+static int __initdata of_platform;
+
+static char __initdata prom_cmd_line[COMMAND_LINE_SIZE];
+
+static unsigned long __initdata prom_memory_limit;
+
+static unsigned long __initdata alloc_top;
+static unsigned long __initdata alloc_top_high;
+static unsigned long __initdata alloc_bottom;
+static unsigned long __initdata rmo_top;
+static unsigned long __initdata ram_top;
+
+static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE];
+static int __initdata mem_reserve_cnt;
+
+static cell_t __initdata regbuf[1024];
+
+static bool rtas_has_query_cpu_stopped;
+
+
+/*
+ * Error results ... some OF calls will return "-1" on error, some
+ * will return 0, some will return either. To simplify, here are
+ * macros to use with any ihandle or phandle return value to check if
+ * it is valid
+ */
+
+#define PROM_ERROR		(-1u)
+#define PHANDLE_VALID(p)	((p) != 0 && (p) != PROM_ERROR)
+#define IHANDLE_VALID(i)	((i) != 0 && (i) != PROM_ERROR)
+
+
+/* This is the one and *ONLY* place where we actually call open
+ * firmware.
+ */
+
+static int __init call_prom(const char *service, int nargs, int nret, ...)
+{
+	int i;
+	struct prom_args args;
+	va_list list;
+
+	args.service = cpu_to_be32(ADDR(service));
+	args.nargs = cpu_to_be32(nargs);
+	args.nret = cpu_to_be32(nret);
+
+	va_start(list, nret);
+	for (i = 0; i < nargs; i++)
+		args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t));
+	va_end(list);
+
+	for (i = 0; i < nret; i++)
+		args.args[nargs+i] = 0;
+
+	if (enter_prom(&args, prom_entry) < 0)
+		return PROM_ERROR;
+
+	return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0;
+}
+
+static int __init call_prom_ret(const char *service, int nargs, int nret,
+				prom_arg_t *rets, ...)
+{
+	int i;
+	struct prom_args args;
+	va_list list;
+
+	args.service = cpu_to_be32(ADDR(service));
+	args.nargs = cpu_to_be32(nargs);
+	args.nret = cpu_to_be32(nret);
+
+	va_start(list, rets);
+	for (i = 0; i < nargs; i++)
+		args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t));
+	va_end(list);
+
+	for (i = 0; i < nret; i++)
+		args.args[nargs+i] = 0;
+
+	if (enter_prom(&args, prom_entry) < 0)
+		return PROM_ERROR;
+
+	if (rets != NULL)
+		for (i = 1; i < nret; ++i)
+			rets[i-1] = be32_to_cpu(args.args[nargs+i]);
+
+	return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0;
+}
+
+
+static void __init prom_print(const char *msg)
+{
+	const char *p, *q;
+
+	if (prom.stdout == 0)
+		return;
+
+	for (p = msg; *p != 0; p = q) {
+		for (q = p; *q != 0 && *q != '\n'; ++q)
+			;
+		if (q > p)
+			call_prom("write", 3, 1, prom.stdout, p, q - p);
+		if (*q == 0)
+			break;
+		++q;
+		call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2);
+	}
+}
+
+
+/*
+ * Both prom_print_hex & prom_print_dec takes an unsigned long as input so that
+ * we do not need __udivdi3 or __umoddi3 on 32bits.
+ */
+static void __init prom_print_hex(unsigned long val)
+{
+	int i, nibbles = sizeof(val)*2;
+	char buf[sizeof(val)*2+1];
+
+	for (i = nibbles-1;  i >= 0;  i--) {
+		buf[i] = (val & 0xf) + '0';
+		if (buf[i] > '9')
+			buf[i] += ('a'-'0'-10);
+		val >>= 4;
+	}
+	buf[nibbles] = '\0';
+	call_prom("write", 3, 1, prom.stdout, buf, nibbles);
+}
+
+/* max number of decimal digits in an unsigned long */
+#define UL_DIGITS 21
+static void __init prom_print_dec(unsigned long val)
+{
+	int i, size;
+	char buf[UL_DIGITS+1];
+
+	for (i = UL_DIGITS-1; i >= 0;  i--) {
+		buf[i] = (val % 10) + '0';
+		val = val/10;
+		if (val == 0)
+			break;
+	}
+	/* shift stuff down */
+	size = UL_DIGITS - i;
+	call_prom("write", 3, 1, prom.stdout, buf+i, size);
+}
+
+__printf(1, 2)
+static void __init prom_printf(const char *format, ...)
+{
+	const char *p, *q, *s;
+	va_list args;
+	unsigned long v;
+	long vs;
+	int n = 0;
+
+	va_start(args, format);
+	for (p = format; *p != 0; p = q) {
+		for (q = p; *q != 0 && *q != '\n' && *q != '%'; ++q)
+			;
+		if (q > p)
+			call_prom("write", 3, 1, prom.stdout, p, q - p);
+		if (*q == 0)
+			break;
+		if (*q == '\n') {
+			++q;
+			call_prom("write", 3, 1, prom.stdout,
+				  ADDR("\r\n"), 2);
+			continue;
+		}
+		++q;
+		if (*q == 0)
+			break;
+		while (*q == 'l') {
+			++q;
+			++n;
+		}
+		switch (*q) {
+		case 's':
+			++q;
+			s = va_arg(args, const char *);
+			prom_print(s);
+			break;
+		case 'x':
+			++q;
+			switch (n) {
+			case 0:
+				v = va_arg(args, unsigned int);
+				break;
+			case 1:
+				v = va_arg(args, unsigned long);
+				break;
+			case 2:
+			default:
+				v = va_arg(args, unsigned long long);
+				break;
+			}
+			prom_print_hex(v);
+			break;
+		case 'u':
+			++q;
+			switch (n) {
+			case 0:
+				v = va_arg(args, unsigned int);
+				break;
+			case 1:
+				v = va_arg(args, unsigned long);
+				break;
+			case 2:
+			default:
+				v = va_arg(args, unsigned long long);
+				break;
+			}
+			prom_print_dec(v);
+			break;
+		case 'd':
+			++q;
+			switch (n) {
+			case 0:
+				vs = va_arg(args, int);
+				break;
+			case 1:
+				vs = va_arg(args, long);
+				break;
+			case 2:
+			default:
+				vs = va_arg(args, long long);
+				break;
+			}
+			if (vs < 0) {
+				prom_print("-");
+				vs = -vs;
+			}
+			prom_print_dec(vs);
+			break;
+		}
+	}
+	va_end(args);
+}
+
+
+static unsigned int __init prom_claim(unsigned long virt, unsigned long size,
+				unsigned long align)
+{
+
+	if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) {
+		/*
+		 * Old OF requires we claim physical and virtual separately
+		 * and then map explicitly (assuming virtual mode)
+		 */
+		int ret;
+		prom_arg_t result;
+
+		ret = call_prom_ret("call-method", 5, 2, &result,
+				    ADDR("claim"), prom.memory,
+				    align, size, virt);
+		if (ret != 0 || result == -1)
+			return -1;
+		ret = call_prom_ret("call-method", 5, 2, &result,
+				    ADDR("claim"), prom.mmumap,
+				    align, size, virt);
+		if (ret != 0) {
+			call_prom("call-method", 4, 1, ADDR("release"),
+				  prom.memory, size, virt);
+			return -1;
+		}
+		/* the 0x12 is M (coherence) + PP == read/write */
+		call_prom("call-method", 6, 1,
+			  ADDR("map"), prom.mmumap, 0x12, size, virt, virt);
+		return virt;
+	}
+	return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size,
+			 (prom_arg_t)align);
+}
+
+static void __init __attribute__((noreturn)) prom_panic(const char *reason)
+{
+	prom_print(reason);
+	/* Do not call exit because it clears the screen on pmac
+	 * it also causes some sort of double-fault on early pmacs */
+	if (of_platform == PLATFORM_POWERMAC)
+		asm("trap\n");
+
+	/* ToDo: should put up an SRC here on pSeries */
+	call_prom("exit", 0, 0);
+
+	for (;;)			/* should never get here */
+		;
+}
+
+
+static int __init prom_next_node(phandle *nodep)
+{
+	phandle node;
+
+	if ((node = *nodep) != 0
+	    && (*nodep = call_prom("child", 1, 1, node)) != 0)
+		return 1;
+	if ((*nodep = call_prom("peer", 1, 1, node)) != 0)
+		return 1;
+	for (;;) {
+		if ((node = call_prom("parent", 1, 1, node)) == 0)
+			return 0;
+		if ((*nodep = call_prom("peer", 1, 1, node)) != 0)
+			return 1;
+	}
+}
+
+static inline int prom_getprop(phandle node, const char *pname,
+			       void *value, size_t valuelen)
+{
+	return call_prom("getprop", 4, 1, node, ADDR(pname),
+			 (u32)(unsigned long) value, (u32) valuelen);
+}
+
+static inline int prom_getproplen(phandle node, const char *pname)
+{
+	return call_prom("getproplen", 2, 1, node, ADDR(pname));
+}
+
+static void add_string(char **str, const char *q)
+{
+	char *p = *str;
+
+	while (*q)
+		*p++ = *q++;
+	*p++ = ' ';
+	*str = p;
+}
+
+static char *tohex(unsigned int x)
+{
+	static char digits[] = "0123456789abcdef";
+	static char result[9];
+	int i;
+
+	result[8] = 0;
+	i = 8;
+	do {
+		--i;
+		result[i] = digits[x & 0xf];
+		x >>= 4;
+	} while (x != 0 && i > 0);
+	return &result[i];
+}
+
+static int __init prom_setprop(phandle node, const char *nodename,
+			       const char *pname, void *value, size_t valuelen)
+{
+	char cmd[256], *p;
+
+	if (!(OF_WORKAROUNDS & OF_WA_LONGTRAIL))
+		return call_prom("setprop", 4, 1, node, ADDR(pname),
+				 (u32)(unsigned long) value, (u32) valuelen);
+
+	/* gah... setprop doesn't work on longtrail, have to use interpret */
+	p = cmd;
+	add_string(&p, "dev");
+	add_string(&p, nodename);
+	add_string(&p, tohex((u32)(unsigned long) value));
+	add_string(&p, tohex(valuelen));
+	add_string(&p, tohex(ADDR(pname)));
+	add_string(&p, tohex(strlen(pname)));
+	add_string(&p, "property");
+	*p = 0;
+	return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd);
+}
+
+/* We can't use the standard versions because of relocation headaches. */
+#define isxdigit(c)	(('0' <= (c) && (c) <= '9') \
+			 || ('a' <= (c) && (c) <= 'f') \
+			 || ('A' <= (c) && (c) <= 'F'))
+
+#define isdigit(c)	('0' <= (c) && (c) <= '9')
+#define islower(c)	('a' <= (c) && (c) <= 'z')
+#define toupper(c)	(islower(c) ? ((c) - 'a' + 'A') : (c))
+
+static unsigned long prom_strtoul(const char *cp, const char **endp)
+{
+	unsigned long result = 0, base = 10, value;
+
+	if (*cp == '0') {
+		base = 8;
+		cp++;
+		if (toupper(*cp) == 'X') {
+			cp++;
+			base = 16;
+		}
+	}
+
+	while (isxdigit(*cp) &&
+	       (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
+		result = result * base + value;
+		cp++;
+	}
+
+	if (endp)
+		*endp = cp;
+
+	return result;
+}
+
+static unsigned long prom_memparse(const char *ptr, const char **retptr)
+{
+	unsigned long ret = prom_strtoul(ptr, retptr);
+	int shift = 0;
+
+	/*
+	 * We can't use a switch here because GCC *may* generate a
+	 * jump table which won't work, because we're not running at
+	 * the address we're linked at.
+	 */
+	if ('G' == **retptr || 'g' == **retptr)
+		shift = 30;
+
+	if ('M' == **retptr || 'm' == **retptr)
+		shift = 20;
+
+	if ('K' == **retptr || 'k' == **retptr)
+		shift = 10;
+
+	if (shift) {
+		ret <<= shift;
+		(*retptr)++;
+	}
+
+	return ret;
+}
+
+/*
+ * Early parsing of the command line passed to the kernel, used for
+ * "mem=x" and the options that affect the iommu
+ */
+static void __init early_cmdline_parse(void)
+{
+	const char *opt;
+
+	char *p;
+	int l __maybe_unused = 0;
+
+	prom_cmd_line[0] = 0;
+	p = prom_cmd_line;
+	if ((long)prom.chosen > 0)
+		l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1);
+#ifdef CONFIG_CMDLINE
+	if (l <= 0 || p[0] == '\0') /* dbl check */
+		strlcpy(prom_cmd_line,
+			CONFIG_CMDLINE, sizeof(prom_cmd_line));
+#endif /* CONFIG_CMDLINE */
+	prom_printf("command line: %s\n", prom_cmd_line);
+
+#ifdef CONFIG_PPC64
+	opt = strstr(prom_cmd_line, "iommu=");
+	if (opt) {
+		prom_printf("iommu opt is: %s\n", opt);
+		opt += 6;
+		while (*opt && *opt == ' ')
+			opt++;
+		if (!strncmp(opt, "off", 3))
+			prom_iommu_off = 1;
+		else if (!strncmp(opt, "force", 5))
+			prom_iommu_force_on = 1;
+	}
+#endif
+	opt = strstr(prom_cmd_line, "mem=");
+	if (opt) {
+		opt += 4;
+		prom_memory_limit = prom_memparse(opt, (const char **)&opt);
+#ifdef CONFIG_PPC64
+		/* Align to 16 MB == size of ppc64 large page */
+		prom_memory_limit = ALIGN(prom_memory_limit, 0x1000000);
+#endif
+	}
+
+	opt = strstr(prom_cmd_line, "disable_radix");
+	if (opt) {
+		opt += 13;
+		if (*opt && *opt == '=') {
+			bool val;
+
+			if (kstrtobool(++opt, &val))
+				prom_radix_disable = false;
+			else
+				prom_radix_disable = val;
+		} else
+			prom_radix_disable = true;
+	}
+	if (prom_radix_disable)
+		prom_debug("Radix disabled from cmdline\n");
+}
+
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * The architecture vector has an array of PVR mask/value pairs,
+ * followed by # option vectors - 1, followed by the option vectors.
+ *
+ * See prom.h for the definition of the bits specified in the
+ * architecture vector.
+ */
+
+/* Firmware expects the value to be n - 1, where n is the # of vectors */
+#define NUM_VECTORS(n)		((n) - 1)
+
+/*
+ * Firmware expects 1 + n - 2, where n is the length of the option vector in
+ * bytes. The 1 accounts for the length byte itself, the - 2 .. ?
+ */
+#define VECTOR_LENGTH(n)	(1 + (n) - 2)
+
+struct option_vector1 {
+	u8 byte1;
+	u8 arch_versions;
+	u8 arch_versions3;
+} __packed;
+
+struct option_vector2 {
+	u8 byte1;
+	__be16 reserved;
+	__be32 real_base;
+	__be32 real_size;
+	__be32 virt_base;
+	__be32 virt_size;
+	__be32 load_base;
+	__be32 min_rma;
+	__be32 min_load;
+	u8 min_rma_percent;
+	u8 max_pft_size;
+} __packed;
+
+struct option_vector3 {
+	u8 byte1;
+	u8 byte2;
+} __packed;
+
+struct option_vector4 {
+	u8 byte1;
+	u8 min_vp_cap;
+} __packed;
+
+struct option_vector5 {
+	u8 byte1;
+	u8 byte2;
+	u8 byte3;
+	u8 cmo;
+	u8 associativity;
+	u8 bin_opts;
+	u8 micro_checkpoint;
+	u8 reserved0;
+	__be32 max_cpus;
+	__be16 papr_level;
+	__be16 reserved1;
+	u8 platform_facilities;
+	u8 reserved2;
+	__be16 reserved3;
+	u8 subprocessors;
+	u8 byte22;
+	u8 intarch;
+	u8 mmu;
+	u8 hash_ext;
+	u8 radix_ext;
+} __packed;
+
+struct option_vector6 {
+	u8 reserved;
+	u8 secondary_pteg;
+	u8 os_name;
+} __packed;
+
+struct ibm_arch_vec {
+	struct { u32 mask, val; } pvrs[12];
+
+	u8 num_vectors;
+
+	u8 vec1_len;
+	struct option_vector1 vec1;
+
+	u8 vec2_len;
+	struct option_vector2 vec2;
+
+	u8 vec3_len;
+	struct option_vector3 vec3;
+
+	u8 vec4_len;
+	struct option_vector4 vec4;
+
+	u8 vec5_len;
+	struct option_vector5 vec5;
+
+	u8 vec6_len;
+	struct option_vector6 vec6;
+} __packed;
+
+struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
+	.pvrs = {
+		{
+			.mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */
+			.val  = cpu_to_be32(0x003a0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER6 */
+			.val  = cpu_to_be32(0x003e0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER7 */
+			.val  = cpu_to_be32(0x003f0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER8E */
+			.val  = cpu_to_be32(0x004b0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER8NVL */
+			.val  = cpu_to_be32(0x004c0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER8 */
+			.val  = cpu_to_be32(0x004d0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER9 */
+			.val  = cpu_to_be32(0x004e0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */
+			.val  = cpu_to_be32(0x0f000005),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */
+			.val  = cpu_to_be32(0x0f000004),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 2.06-compliant */
+			.val  = cpu_to_be32(0x0f000003),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 2.05-compliant */
+			.val  = cpu_to_be32(0x0f000002),
+		},
+		{
+			.mask = cpu_to_be32(0xfffffffe), /* all 2.04-compliant and earlier */
+			.val  = cpu_to_be32(0x0f000001),
+		},
+	},
+
+	.num_vectors = NUM_VECTORS(6),
+
+	.vec1_len = VECTOR_LENGTH(sizeof(struct option_vector1)),
+	.vec1 = {
+		.byte1 = 0,
+		.arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
+				 OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+		.arch_versions3 = OV1_PPC_3_00,
+	},
+
+	.vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)),
+	/* option vector 2: Open Firmware options supported */
+	.vec2 = {
+		.byte1 = OV2_REAL_MODE,
+		.reserved = 0,
+		.real_base = cpu_to_be32(0xffffffff),
+		.real_size = cpu_to_be32(0xffffffff),
+		.virt_base = cpu_to_be32(0xffffffff),
+		.virt_size = cpu_to_be32(0xffffffff),
+		.load_base = cpu_to_be32(0xffffffff),
+		.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
+		.min_load = cpu_to_be32(0xffffffff),	/* full client load */
+		.min_rma_percent = 0,	/* min RMA percentage of total RAM */
+		.max_pft_size = 48,	/* max log_2(hash table size) */
+	},
+
+	.vec3_len = VECTOR_LENGTH(sizeof(struct option_vector3)),
+	/* option vector 3: processor options supported */
+	.vec3 = {
+		.byte1 = 0,			/* don't ignore, don't halt */
+		.byte2 = OV3_FP | OV3_VMX | OV3_DFP,
+	},
+
+	.vec4_len = VECTOR_LENGTH(sizeof(struct option_vector4)),
+	/* option vector 4: IBM PAPR implementation */
+	.vec4 = {
+		.byte1 = 0,			/* don't halt */
+		.min_vp_cap = OV4_MIN_ENT_CAP,	/* minimum VP entitled capacity */
+	},
+
+	.vec5_len = VECTOR_LENGTH(sizeof(struct option_vector5)),
+	/* option vector 5: PAPR/OF options */
+	.vec5 = {
+		.byte1 = 0,				/* don't ignore, don't halt */
+		.byte2 = OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) |
+		OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) |
+#ifdef CONFIG_PCI_MSI
+		/* PCIe/MSI support.  Without MSI full PCIe is not supported */
+		OV5_FEAT(OV5_MSI),
+#else
+		0,
+#endif
+		.byte3 = 0,
+		.cmo =
+#ifdef CONFIG_PPC_SMLPAR
+		OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO),
+#else
+		0,
+#endif
+		.associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
+		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
+		.micro_checkpoint = 0,
+		.reserved0 = 0,
+		.max_cpus = cpu_to_be32(NR_CPUS),	/* number of cores supported */
+		.papr_level = 0,
+		.reserved1 = 0,
+		.platform_facilities = OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) | OV5_FEAT(OV5_PFO_HW_842),
+		.reserved2 = 0,
+		.reserved3 = 0,
+		.subprocessors = 1,
+		.byte22 = OV5_FEAT(OV5_DRMEM_V2) | OV5_FEAT(OV5_DRC_INFO),
+		.intarch = 0,
+		.mmu = 0,
+		.hash_ext = 0,
+		.radix_ext = 0,
+	},
+
+	/* option vector 6: IBM PAPR hints */
+	.vec6_len = VECTOR_LENGTH(sizeof(struct option_vector6)),
+	.vec6 = {
+		.reserved = 0,
+		.secondary_pteg = 0,
+		.os_name = OV6_LINUX,
+	},
+};
+
+/* Old method - ELF header with PT_NOTE sections only works on BE */
+#ifdef __BIG_ENDIAN__
+static struct fake_elf {
+	Elf32_Ehdr	elfhdr;
+	Elf32_Phdr	phdr[2];
+	struct chrpnote {
+		u32	namesz;
+		u32	descsz;
+		u32	type;
+		char	name[8];	/* "PowerPC" */
+		struct chrpdesc {
+			u32	real_mode;
+			u32	real_base;
+			u32	real_size;
+			u32	virt_base;
+			u32	virt_size;
+			u32	load_base;
+		} chrpdesc;
+	} chrpnote;
+	struct rpanote {
+		u32	namesz;
+		u32	descsz;
+		u32	type;
+		char	name[24];	/* "IBM,RPA-Client-Config" */
+		struct rpadesc {
+			u32	lpar_affinity;
+			u32	min_rmo_size;
+			u32	min_rmo_percent;
+			u32	max_pft_size;
+			u32	splpar;
+			u32	min_load;
+			u32	new_mem_def;
+			u32	ignore_me;
+		} rpadesc;
+	} rpanote;
+} fake_elf = {
+	.elfhdr = {
+		.e_ident = { 0x7f, 'E', 'L', 'F',
+			     ELFCLASS32, ELFDATA2MSB, EV_CURRENT },
+		.e_type = ET_EXEC,	/* yeah right */
+		.e_machine = EM_PPC,
+		.e_version = EV_CURRENT,
+		.e_phoff = offsetof(struct fake_elf, phdr),
+		.e_phentsize = sizeof(Elf32_Phdr),
+		.e_phnum = 2
+	},
+	.phdr = {
+		[0] = {
+			.p_type = PT_NOTE,
+			.p_offset = offsetof(struct fake_elf, chrpnote),
+			.p_filesz = sizeof(struct chrpnote)
+		}, [1] = {
+			.p_type = PT_NOTE,
+			.p_offset = offsetof(struct fake_elf, rpanote),
+			.p_filesz = sizeof(struct rpanote)
+		}
+	},
+	.chrpnote = {
+		.namesz = sizeof("PowerPC"),
+		.descsz = sizeof(struct chrpdesc),
+		.type = 0x1275,
+		.name = "PowerPC",
+		.chrpdesc = {
+			.real_mode = ~0U,	/* ~0 means "don't care" */
+			.real_base = ~0U,
+			.real_size = ~0U,
+			.virt_base = ~0U,
+			.virt_size = ~0U,
+			.load_base = ~0U
+		},
+	},
+	.rpanote = {
+		.namesz = sizeof("IBM,RPA-Client-Config"),
+		.descsz = sizeof(struct rpadesc),
+		.type = 0x12759999,
+		.name = "IBM,RPA-Client-Config",
+		.rpadesc = {
+			.lpar_affinity = 0,
+			.min_rmo_size = 64,	/* in megabytes */
+			.min_rmo_percent = 0,
+			.max_pft_size = 48,	/* 2^48 bytes max PFT size */
+			.splpar = 1,
+			.min_load = ~0U,
+			.new_mem_def = 0
+		}
+	}
+};
+#endif /* __BIG_ENDIAN__ */
+
+static int __init prom_count_smt_threads(void)
+{
+	phandle node;
+	char type[64];
+	unsigned int plen;
+
+	/* Pick up th first CPU node we can find */
+	for (node = 0; prom_next_node(&node); ) {
+		type[0] = 0;
+		prom_getprop(node, "device_type", type, sizeof(type));
+
+		if (strcmp(type, "cpu"))
+			continue;
+		/*
+		 * There is an entry for each smt thread, each entry being
+		 * 4 bytes long.  All cpus should have the same number of
+		 * smt threads, so return after finding the first.
+		 */
+		plen = prom_getproplen(node, "ibm,ppc-interrupt-server#s");
+		if (plen == PROM_ERROR)
+			break;
+		plen >>= 2;
+		prom_debug("Found %lu smt threads per core\n", (unsigned long)plen);
+
+		/* Sanity check */
+		if (plen < 1 || plen > 64) {
+			prom_printf("Threads per core %lu out of bounds, assuming 1\n",
+				    (unsigned long)plen);
+			return 1;
+		}
+		return plen;
+	}
+	prom_debug("No threads found, assuming 1 per core\n");
+
+	return 1;
+
+}
+
+static void __init prom_parse_mmu_model(u8 val,
+					struct platform_support *support)
+{
+	switch (val) {
+	case OV5_FEAT(OV5_MMU_DYNAMIC):
+	case OV5_FEAT(OV5_MMU_EITHER): /* Either Available */
+		prom_debug("MMU - either supported\n");
+		support->radix_mmu = !prom_radix_disable;
+		support->hash_mmu = true;
+		break;
+	case OV5_FEAT(OV5_MMU_RADIX): /* Only Radix */
+		prom_debug("MMU - radix only\n");
+		if (prom_radix_disable) {
+			/*
+			 * If we __have__ to do radix, we're better off ignoring
+			 * the command line rather than not booting.
+			 */
+			prom_printf("WARNING: Ignoring cmdline option disable_radix\n");
+		}
+		support->radix_mmu = true;
+		break;
+	case OV5_FEAT(OV5_MMU_HASH):
+		prom_debug("MMU - hash only\n");
+		support->hash_mmu = true;
+		break;
+	default:
+		prom_debug("Unknown mmu support option: 0x%x\n", val);
+		break;
+	}
+}
+
+static void __init prom_parse_xive_model(u8 val,
+					 struct platform_support *support)
+{
+	switch (val) {
+	case OV5_FEAT(OV5_XIVE_EITHER): /* Either Available */
+		prom_debug("XIVE - either mode supported\n");
+		support->xive = true;
+		break;
+	case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */
+		prom_debug("XIVE - exploitation mode supported\n");
+		support->xive = true;
+		break;
+	case OV5_FEAT(OV5_XIVE_LEGACY): /* Only Legacy mode */
+		prom_debug("XIVE - legacy mode supported\n");
+		break;
+	default:
+		prom_debug("Unknown xive support option: 0x%x\n", val);
+		break;
+	}
+}
+
+static void __init prom_parse_platform_support(u8 index, u8 val,
+					       struct platform_support *support)
+{
+	switch (index) {
+	case OV5_INDX(OV5_MMU_SUPPORT): /* MMU Model */
+		prom_parse_mmu_model(val & OV5_FEAT(OV5_MMU_SUPPORT), support);
+		break;
+	case OV5_INDX(OV5_RADIX_GTSE): /* Radix Extensions */
+		if (val & OV5_FEAT(OV5_RADIX_GTSE)) {
+			prom_debug("Radix - GTSE supported\n");
+			support->radix_gtse = true;
+		}
+		break;
+	case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */
+		prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT),
+				      support);
+		break;
+	}
+}
+
+static void __init prom_check_platform_support(void)
+{
+	struct platform_support supported = {
+		.hash_mmu = false,
+		.radix_mmu = false,
+		.radix_gtse = false,
+		.xive = false
+	};
+	int prop_len = prom_getproplen(prom.chosen,
+				       "ibm,arch-vec-5-platform-support");
+	if (prop_len > 1) {
+		int i;
+		u8 vec[prop_len];
+		prom_debug("Found ibm,arch-vec-5-platform-support, len: %d\n",
+			   prop_len);
+		prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support",
+			     &vec, sizeof(vec));
+		for (i = 0; i < prop_len; i += 2) {
+			prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2
+								  , vec[i]
+								  , vec[i + 1]);
+			prom_parse_platform_support(vec[i], vec[i + 1],
+						    &supported);
+		}
+	}
+
+	if (supported.radix_mmu && supported.radix_gtse &&
+	    IS_ENABLED(CONFIG_PPC_RADIX_MMU)) {
+		/* Radix preferred - but we require GTSE for now */
+		prom_debug("Asking for radix with GTSE\n");
+		ibm_architecture_vec.vec5.mmu = OV5_FEAT(OV5_MMU_RADIX);
+		ibm_architecture_vec.vec5.radix_ext = OV5_FEAT(OV5_RADIX_GTSE);
+	} else if (supported.hash_mmu) {
+		/* Default to hash mmu (if we can) */
+		prom_debug("Asking for hash\n");
+		ibm_architecture_vec.vec5.mmu = OV5_FEAT(OV5_MMU_HASH);
+	} else {
+		/* We're probably on a legacy hypervisor */
+		prom_debug("Assuming legacy hash support\n");
+	}
+
+	if (supported.xive) {
+		prom_debug("Asking for XIVE\n");
+		ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT);
+	}
+}
+
+static void __init prom_send_capabilities(void)
+{
+	ihandle root;
+	prom_arg_t ret;
+	u32 cores;
+
+	/* Check ibm,arch-vec-5-platform-support and fixup vec5 if required */
+	prom_check_platform_support();
+
+	root = call_prom("open", 1, 1, ADDR("/"));
+	if (root != 0) {
+		/* We need to tell the FW about the number of cores we support.
+		 *
+		 * To do that, we count the number of threads on the first core
+		 * (we assume this is the same for all cores) and use it to
+		 * divide NR_CPUS.
+		 */
+
+		cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads());
+		prom_printf("Max number of cores passed to firmware: %u (NR_CPUS = %d)\n",
+			    cores, NR_CPUS);
+
+		ibm_architecture_vec.vec5.max_cpus = cpu_to_be32(cores);
+
+		/* try calling the ibm,client-architecture-support method */
+		prom_printf("Calling ibm,client-architecture-support...");
+		if (call_prom_ret("call-method", 3, 2, &ret,
+				  ADDR("ibm,client-architecture-support"),
+				  root,
+				  ADDR(&ibm_architecture_vec)) == 0) {
+			/* the call exists... */
+			if (ret)
+				prom_printf("\nWARNING: ibm,client-architecture"
+					    "-support call FAILED!\n");
+			call_prom("close", 1, 0, root);
+			prom_printf(" done\n");
+			return;
+		}
+		call_prom("close", 1, 0, root);
+		prom_printf(" not implemented\n");
+	}
+
+#ifdef __BIG_ENDIAN__
+	{
+		ihandle elfloader;
+
+		/* no ibm,client-architecture-support call, try the old way */
+		elfloader = call_prom("open", 1, 1,
+				      ADDR("/packages/elf-loader"));
+		if (elfloader == 0) {
+			prom_printf("couldn't open /packages/elf-loader\n");
+			return;
+		}
+		call_prom("call-method", 3, 1, ADDR("process-elf-header"),
+			  elfloader, ADDR(&fake_elf));
+		call_prom("close", 1, 0, elfloader);
+	}
+#endif /* __BIG_ENDIAN__ */
+}
+#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
+
+/*
+ * Memory allocation strategy... our layout is normally:
+ *
+ *  at 14Mb or more we have vmlinux, then a gap and initrd.  In some
+ *  rare cases, initrd might end up being before the kernel though.
+ *  We assume this won't override the final kernel at 0, we have no
+ *  provision to handle that in this version, but it should hopefully
+ *  never happen.
+ *
+ *  alloc_top is set to the top of RMO, eventually shrink down if the
+ *  TCEs overlap
+ *
+ *  alloc_bottom is set to the top of kernel/initrd
+ *
+ *  from there, allocations are done this way : rtas is allocated
+ *  topmost, and the device-tree is allocated from the bottom. We try
+ *  to grow the device-tree allocation as we progress. If we can't,
+ *  then we fail, we don't currently have a facility to restart
+ *  elsewhere, but that shouldn't be necessary.
+ *
+ *  Note that calls to reserve_mem have to be done explicitly, memory
+ *  allocated with either alloc_up or alloc_down isn't automatically
+ *  reserved.
+ */
+
+
+/*
+ * Allocates memory in the RMO upward from the kernel/initrd
+ *
+ * When align is 0, this is a special case, it means to allocate in place
+ * at the current location of alloc_bottom or fail (that is basically
+ * extending the previous allocation). Used for the device-tree flattening
+ */
+static unsigned long __init alloc_up(unsigned long size, unsigned long align)
+{
+	unsigned long base = alloc_bottom;
+	unsigned long addr = 0;
+
+	if (align)
+		base = _ALIGN_UP(base, align);
+	prom_debug("%s(%lx, %lx)\n", __func__, size, align);
+	if (ram_top == 0)
+		prom_panic("alloc_up() called with mem not initialized\n");
+
+	if (align)
+		base = _ALIGN_UP(alloc_bottom, align);
+	else
+		base = alloc_bottom;
+
+	for(; (base + size) <= alloc_top; 
+	    base = _ALIGN_UP(base + 0x100000, align)) {
+		prom_debug("    trying: 0x%lx\n\r", base);
+		addr = (unsigned long)prom_claim(base, size, 0);
+		if (addr != PROM_ERROR && addr != 0)
+			break;
+		addr = 0;
+		if (align == 0)
+			break;
+	}
+	if (addr == 0)
+		return 0;
+	alloc_bottom = addr + size;
+
+	prom_debug(" -> %lx\n", addr);
+	prom_debug("  alloc_bottom : %lx\n", alloc_bottom);
+	prom_debug("  alloc_top    : %lx\n", alloc_top);
+	prom_debug("  alloc_top_hi : %lx\n", alloc_top_high);
+	prom_debug("  rmo_top      : %lx\n", rmo_top);
+	prom_debug("  ram_top      : %lx\n", ram_top);
+
+	return addr;
+}
+
+/*
+ * Allocates memory downward, either from top of RMO, or if highmem
+ * is set, from the top of RAM.  Note that this one doesn't handle
+ * failures.  It does claim memory if highmem is not set.
+ */
+static unsigned long __init alloc_down(unsigned long size, unsigned long align,
+				       int highmem)
+{
+	unsigned long base, addr = 0;
+
+	prom_debug("%s(%lx, %lx, %s)\n", __func__, size, align,
+		   highmem ? "(high)" : "(low)");
+	if (ram_top == 0)
+		prom_panic("alloc_down() called with mem not initialized\n");
+
+	if (highmem) {
+		/* Carve out storage for the TCE table. */
+		addr = _ALIGN_DOWN(alloc_top_high - size, align);
+		if (addr <= alloc_bottom)
+			return 0;
+		/* Will we bump into the RMO ? If yes, check out that we
+		 * didn't overlap existing allocations there, if we did,
+		 * we are dead, we must be the first in town !
+		 */
+		if (addr < rmo_top) {
+			/* Good, we are first */
+			if (alloc_top == rmo_top)
+				alloc_top = rmo_top = addr;
+			else
+				return 0;
+		}
+		alloc_top_high = addr;
+		goto bail;
+	}
+
+	base = _ALIGN_DOWN(alloc_top - size, align);
+	for (; base > alloc_bottom;
+	     base = _ALIGN_DOWN(base - 0x100000, align))  {
+		prom_debug("    trying: 0x%lx\n\r", base);
+		addr = (unsigned long)prom_claim(base, size, 0);
+		if (addr != PROM_ERROR && addr != 0)
+			break;
+		addr = 0;
+	}
+	if (addr == 0)
+		return 0;
+	alloc_top = addr;
+
+ bail:
+	prom_debug(" -> %lx\n", addr);
+	prom_debug("  alloc_bottom : %lx\n", alloc_bottom);
+	prom_debug("  alloc_top    : %lx\n", alloc_top);
+	prom_debug("  alloc_top_hi : %lx\n", alloc_top_high);
+	prom_debug("  rmo_top      : %lx\n", rmo_top);
+	prom_debug("  ram_top      : %lx\n", ram_top);
+
+	return addr;
+}
+
+/*
+ * Parse a "reg" cell
+ */
+static unsigned long __init prom_next_cell(int s, cell_t **cellp)
+{
+	cell_t *p = *cellp;
+	unsigned long r = 0;
+
+	/* Ignore more than 2 cells */
+	while (s > sizeof(unsigned long) / 4) {
+		p++;
+		s--;
+	}
+	r = be32_to_cpu(*p++);
+#ifdef CONFIG_PPC64
+	if (s > 1) {
+		r <<= 32;
+		r |= be32_to_cpu(*(p++));
+	}
+#endif
+	*cellp = p;
+	return r;
+}
+
+/*
+ * Very dumb function for adding to the memory reserve list, but
+ * we don't need anything smarter at this point
+ *
+ * XXX Eventually check for collisions.  They should NEVER happen.
+ * If problems seem to show up, it would be a good start to track
+ * them down.
+ */
+static void __init reserve_mem(u64 base, u64 size)
+{
+	u64 top = base + size;
+	unsigned long cnt = mem_reserve_cnt;
+
+	if (size == 0)
+		return;
+
+	/* We need to always keep one empty entry so that we
+	 * have our terminator with "size" set to 0 since we are
+	 * dumb and just copy this entire array to the boot params
+	 */
+	base = _ALIGN_DOWN(base, PAGE_SIZE);
+	top = _ALIGN_UP(top, PAGE_SIZE);
+	size = top - base;
+
+	if (cnt >= (MEM_RESERVE_MAP_SIZE - 1))
+		prom_panic("Memory reserve map exhausted !\n");
+	mem_reserve_map[cnt].base = cpu_to_be64(base);
+	mem_reserve_map[cnt].size = cpu_to_be64(size);
+	mem_reserve_cnt = cnt + 1;
+}
+
+/*
+ * Initialize memory allocation mechanism, parse "memory" nodes and
+ * obtain that way the top of memory and RMO to setup out local allocator
+ */
+static void __init prom_init_mem(void)
+{
+	phandle node;
+#ifdef DEBUG_PROM
+	char *path;
+#endif
+	char type[64];
+	unsigned int plen;
+	cell_t *p, *endp;
+	__be32 val;
+	u32 rac, rsc;
+
+	/*
+	 * We iterate the memory nodes to find
+	 * 1) top of RMO (first node)
+	 * 2) top of memory
+	 */
+	val = cpu_to_be32(2);
+	prom_getprop(prom.root, "#address-cells", &val, sizeof(val));
+	rac = be32_to_cpu(val);
+	val = cpu_to_be32(1);
+	prom_getprop(prom.root, "#size-cells", &val, sizeof(rsc));
+	rsc = be32_to_cpu(val);
+	prom_debug("root_addr_cells: %x\n", rac);
+	prom_debug("root_size_cells: %x\n", rsc);
+
+	prom_debug("scanning memory:\n");
+#ifdef DEBUG_PROM
+	path = prom_scratch;
+#endif
+
+	for (node = 0; prom_next_node(&node); ) {
+		type[0] = 0;
+		prom_getprop(node, "device_type", type, sizeof(type));
+
+		if (type[0] == 0) {
+			/*
+			 * CHRP Longtrail machines have no device_type
+			 * on the memory node, so check the name instead...
+			 */
+			prom_getprop(node, "name", type, sizeof(type));
+		}
+		if (strcmp(type, "memory"))
+			continue;
+
+		plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf));
+		if (plen > sizeof(regbuf)) {
+			prom_printf("memory node too large for buffer !\n");
+			plen = sizeof(regbuf);
+		}
+		p = regbuf;
+		endp = p + (plen / sizeof(cell_t));
+
+#ifdef DEBUG_PROM
+		memset(path, 0, PROM_SCRATCH_SIZE);
+		call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
+		prom_debug("  node %s :\n", path);
+#endif /* DEBUG_PROM */
+
+		while ((endp - p) >= (rac + rsc)) {
+			unsigned long base, size;
+
+			base = prom_next_cell(rac, &p);
+			size = prom_next_cell(rsc, &p);
+
+			if (size == 0)
+				continue;
+			prom_debug("    %lx %lx\n", base, size);
+			if (base == 0 && (of_platform & PLATFORM_LPAR))
+				rmo_top = size;
+			if ((base + size) > ram_top)
+				ram_top = base + size;
+		}
+	}
+
+	alloc_bottom = PAGE_ALIGN((unsigned long)&_end + 0x4000);
+
+	/*
+	 * If prom_memory_limit is set we reduce the upper limits *except* for
+	 * alloc_top_high. This must be the real top of RAM so we can put
+	 * TCE's up there.
+	 */
+
+	alloc_top_high = ram_top;
+
+	if (prom_memory_limit) {
+		if (prom_memory_limit <= alloc_bottom) {
+			prom_printf("Ignoring mem=%lx <= alloc_bottom.\n",
+				    prom_memory_limit);
+			prom_memory_limit = 0;
+		} else if (prom_memory_limit >= ram_top) {
+			prom_printf("Ignoring mem=%lx >= ram_top.\n",
+				    prom_memory_limit);
+			prom_memory_limit = 0;
+		} else {
+			ram_top = prom_memory_limit;
+			rmo_top = min(rmo_top, prom_memory_limit);
+		}
+	}
+
+	/*
+	 * Setup our top alloc point, that is top of RMO or top of
+	 * segment 0 when running non-LPAR.
+	 * Some RS64 machines have buggy firmware where claims up at
+	 * 1GB fail.  Cap at 768MB as a workaround.
+	 * Since 768MB is plenty of room, and we need to cap to something
+	 * reasonable on 32-bit, cap at 768MB on all machines.
+	 */
+	if (!rmo_top)
+		rmo_top = ram_top;
+	rmo_top = min(0x30000000ul, rmo_top);
+	alloc_top = rmo_top;
+	alloc_top_high = ram_top;
+
+	/*
+	 * Check if we have an initrd after the kernel but still inside
+	 * the RMO.  If we do move our bottom point to after it.
+	 */
+	if (prom_initrd_start &&
+	    prom_initrd_start < rmo_top &&
+	    prom_initrd_end > alloc_bottom)
+		alloc_bottom = PAGE_ALIGN(prom_initrd_end);
+
+	prom_printf("memory layout at init:\n");
+	prom_printf("  memory_limit : %lx (16 MB aligned)\n",
+		    prom_memory_limit);
+	prom_printf("  alloc_bottom : %lx\n", alloc_bottom);
+	prom_printf("  alloc_top    : %lx\n", alloc_top);
+	prom_printf("  alloc_top_hi : %lx\n", alloc_top_high);
+	prom_printf("  rmo_top      : %lx\n", rmo_top);
+	prom_printf("  ram_top      : %lx\n", ram_top);
+}
+
+static void __init prom_close_stdin(void)
+{
+	__be32 val;
+	ihandle stdin;
+
+	if (prom_getprop(prom.chosen, "stdin", &val, sizeof(val)) > 0) {
+		stdin = be32_to_cpu(val);
+		call_prom("close", 1, 0, stdin);
+	}
+}
+
+#ifdef CONFIG_PPC_POWERNV
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+static u64 __initdata prom_opal_base;
+static u64 __initdata prom_opal_entry;
+#endif
+
+/*
+ * Allocate room for and instantiate OPAL
+ */
+static void __init prom_instantiate_opal(void)
+{
+	phandle opal_node;
+	ihandle opal_inst;
+	u64 base, entry;
+	u64 size = 0, align = 0x10000;
+	__be64 val64;
+	u32 rets[2];
+
+	prom_debug("prom_instantiate_opal: start...\n");
+
+	opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal"));
+	prom_debug("opal_node: %x\n", opal_node);
+	if (!PHANDLE_VALID(opal_node))
+		return;
+
+	val64 = 0;
+	prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64));
+	size = be64_to_cpu(val64);
+	if (size == 0)
+		return;
+	val64 = 0;
+	prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64));
+	align = be64_to_cpu(val64);
+
+	base = alloc_down(size, align, 0);
+	if (base == 0) {
+		prom_printf("OPAL allocation failed !\n");
+		return;
+	}
+
+	opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal"));
+	if (!IHANDLE_VALID(opal_inst)) {
+		prom_printf("opening opal package failed (%x)\n", opal_inst);
+		return;
+	}
+
+	prom_printf("instantiating opal at 0x%llx...", base);
+
+	if (call_prom_ret("call-method", 4, 3, rets,
+			  ADDR("load-opal-runtime"),
+			  opal_inst,
+			  base >> 32, base & 0xffffffff) != 0
+	    || (rets[0] == 0 && rets[1] == 0)) {
+		prom_printf(" failed\n");
+		return;
+	}
+	entry = (((u64)rets[0]) << 32) | rets[1];
+
+	prom_printf(" done\n");
+
+	reserve_mem(base, size);
+
+	prom_debug("opal base     = 0x%llx\n", base);
+	prom_debug("opal align    = 0x%llx\n", align);
+	prom_debug("opal entry    = 0x%llx\n", entry);
+	prom_debug("opal size     = 0x%llx\n", size);
+
+	prom_setprop(opal_node, "/ibm,opal", "opal-base-address",
+		     &base, sizeof(base));
+	prom_setprop(opal_node, "/ibm,opal", "opal-entry-address",
+		     &entry, sizeof(entry));
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+	prom_opal_base = base;
+	prom_opal_entry = entry;
+#endif
+	prom_debug("prom_instantiate_opal: end...\n");
+}
+
+#endif /* CONFIG_PPC_POWERNV */
+
+/*
+ * Allocate room for and instantiate RTAS
+ */
+static void __init prom_instantiate_rtas(void)
+{
+	phandle rtas_node;
+	ihandle rtas_inst;
+	u32 base, entry = 0;
+	__be32 val;
+	u32 size = 0;
+
+	prom_debug("prom_instantiate_rtas: start...\n");
+
+	rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+	prom_debug("rtas_node: %x\n", rtas_node);
+	if (!PHANDLE_VALID(rtas_node))
+		return;
+
+	val = 0;
+	prom_getprop(rtas_node, "rtas-size", &val, sizeof(size));
+	size = be32_to_cpu(val);
+	if (size == 0)
+		return;
+
+	base = alloc_down(size, PAGE_SIZE, 0);
+	if (base == 0)
+		prom_panic("Could not allocate memory for RTAS\n");
+
+	rtas_inst = call_prom("open", 1, 1, ADDR("/rtas"));
+	if (!IHANDLE_VALID(rtas_inst)) {
+		prom_printf("opening rtas package failed (%x)\n", rtas_inst);
+		return;
+	}
+
+	prom_printf("instantiating rtas at 0x%x...", base);
+
+	if (call_prom_ret("call-method", 3, 2, &entry,
+			  ADDR("instantiate-rtas"),
+			  rtas_inst, base) != 0
+	    || entry == 0) {
+		prom_printf(" failed\n");
+		return;
+	}
+	prom_printf(" done\n");
+
+	reserve_mem(base, size);
+
+	val = cpu_to_be32(base);
+	prom_setprop(rtas_node, "/rtas", "linux,rtas-base",
+		     &val, sizeof(val));
+	val = cpu_to_be32(entry);
+	prom_setprop(rtas_node, "/rtas", "linux,rtas-entry",
+		     &val, sizeof(val));
+
+	/* Check if it supports "query-cpu-stopped-state" */
+	if (prom_getprop(rtas_node, "query-cpu-stopped-state",
+			 &val, sizeof(val)) != PROM_ERROR)
+		rtas_has_query_cpu_stopped = true;
+
+	prom_debug("rtas base     = 0x%x\n", base);
+	prom_debug("rtas entry    = 0x%x\n", entry);
+	prom_debug("rtas size     = 0x%x\n", size);
+
+	prom_debug("prom_instantiate_rtas: end...\n");
+}
+
+#ifdef CONFIG_PPC64
+/*
+ * Allocate room for and instantiate Stored Measurement Log (SML)
+ */
+static void __init prom_instantiate_sml(void)
+{
+	phandle ibmvtpm_node;
+	ihandle ibmvtpm_inst;
+	u32 entry = 0, size = 0, succ = 0;
+	u64 base;
+	__be32 val;
+
+	prom_debug("prom_instantiate_sml: start...\n");
+
+	ibmvtpm_node = call_prom("finddevice", 1, 1, ADDR("/vdevice/vtpm"));
+	prom_debug("ibmvtpm_node: %x\n", ibmvtpm_node);
+	if (!PHANDLE_VALID(ibmvtpm_node))
+		return;
+
+	ibmvtpm_inst = call_prom("open", 1, 1, ADDR("/vdevice/vtpm"));
+	if (!IHANDLE_VALID(ibmvtpm_inst)) {
+		prom_printf("opening vtpm package failed (%x)\n", ibmvtpm_inst);
+		return;
+	}
+
+	if (prom_getprop(ibmvtpm_node, "ibm,sml-efi-reformat-supported",
+			 &val, sizeof(val)) != PROM_ERROR) {
+		if (call_prom_ret("call-method", 2, 2, &succ,
+				  ADDR("reformat-sml-to-efi-alignment"),
+				  ibmvtpm_inst) != 0 || succ == 0) {
+			prom_printf("Reformat SML to EFI alignment failed\n");
+			return;
+		}
+
+		if (call_prom_ret("call-method", 2, 2, &size,
+				  ADDR("sml-get-allocated-size"),
+				  ibmvtpm_inst) != 0 || size == 0) {
+			prom_printf("SML get allocated size failed\n");
+			return;
+		}
+	} else {
+		if (call_prom_ret("call-method", 2, 2, &size,
+				  ADDR("sml-get-handover-size"),
+				  ibmvtpm_inst) != 0 || size == 0) {
+			prom_printf("SML get handover size failed\n");
+			return;
+		}
+	}
+
+	base = alloc_down(size, PAGE_SIZE, 0);
+	if (base == 0)
+		prom_panic("Could not allocate memory for sml\n");
+
+	prom_printf("instantiating sml at 0x%llx...", base);
+
+	memset((void *)base, 0, size);
+
+	if (call_prom_ret("call-method", 4, 2, &entry,
+			  ADDR("sml-handover"),
+			  ibmvtpm_inst, size, base) != 0 || entry == 0) {
+		prom_printf("SML handover failed\n");
+		return;
+	}
+	prom_printf(" done\n");
+
+	reserve_mem(base, size);
+
+	prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-base",
+		     &base, sizeof(base));
+	prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-size",
+		     &size, sizeof(size));
+
+	prom_debug("sml base     = 0x%llx\n", base);
+	prom_debug("sml size     = 0x%x\n", size);
+
+	prom_debug("prom_instantiate_sml: end...\n");
+}
+
+/*
+ * Allocate room for and initialize TCE tables
+ */
+#ifdef __BIG_ENDIAN__
+static void __init prom_initialize_tce_table(void)
+{
+	phandle node;
+	ihandle phb_node;
+	char compatible[64], type[64], model[64];
+	char *path = prom_scratch;
+	u64 base, align;
+	u32 minalign, minsize;
+	u64 tce_entry, *tce_entryp;
+	u64 local_alloc_top, local_alloc_bottom;
+	u64 i;
+
+	if (prom_iommu_off)
+		return;
+
+	prom_debug("starting prom_initialize_tce_table\n");
+
+	/* Cache current top of allocs so we reserve a single block */
+	local_alloc_top = alloc_top_high;
+	local_alloc_bottom = local_alloc_top;
+
+	/* Search all nodes looking for PHBs. */
+	for (node = 0; prom_next_node(&node); ) {
+		compatible[0] = 0;
+		type[0] = 0;
+		model[0] = 0;
+		prom_getprop(node, "compatible",
+			     compatible, sizeof(compatible));
+		prom_getprop(node, "device_type", type, sizeof(type));
+		prom_getprop(node, "model", model, sizeof(model));
+
+		if ((type[0] == 0) || (strstr(type, "pci") == NULL))
+			continue;
+
+		/* Keep the old logic intact to avoid regression. */
+		if (compatible[0] != 0) {
+			if ((strstr(compatible, "python") == NULL) &&
+			    (strstr(compatible, "Speedwagon") == NULL) &&
+			    (strstr(compatible, "Winnipeg") == NULL))
+				continue;
+		} else if (model[0] != 0) {
+			if ((strstr(model, "ython") == NULL) &&
+			    (strstr(model, "peedwagon") == NULL) &&
+			    (strstr(model, "innipeg") == NULL))
+				continue;
+		}
+
+		if (prom_getprop(node, "tce-table-minalign", &minalign,
+				 sizeof(minalign)) == PROM_ERROR)
+			minalign = 0;
+		if (prom_getprop(node, "tce-table-minsize", &minsize,
+				 sizeof(minsize)) == PROM_ERROR)
+			minsize = 4UL << 20;
+
+		/*
+		 * Even though we read what OF wants, we just set the table
+		 * size to 4 MB.  This is enough to map 2GB of PCI DMA space.
+		 * By doing this, we avoid the pitfalls of trying to DMA to
+		 * MMIO space and the DMA alias hole.
+		 */
+		minsize = 4UL << 20;
+
+		/* Align to the greater of the align or size */
+		align = max(minalign, minsize);
+		base = alloc_down(minsize, align, 1);
+		if (base == 0)
+			prom_panic("ERROR, cannot find space for TCE table.\n");
+		if (base < local_alloc_bottom)
+			local_alloc_bottom = base;
+
+		/* It seems OF doesn't null-terminate the path :-( */
+		memset(path, 0, PROM_SCRATCH_SIZE);
+		/* Call OF to setup the TCE hardware */
+		if (call_prom("package-to-path", 3, 1, node,
+			      path, PROM_SCRATCH_SIZE-1) == PROM_ERROR) {
+			prom_printf("package-to-path failed\n");
+		}
+
+		/* Save away the TCE table attributes for later use. */
+		prom_setprop(node, path, "linux,tce-base", &base, sizeof(base));
+		prom_setprop(node, path, "linux,tce-size", &minsize, sizeof(minsize));
+
+		prom_debug("TCE table: %s\n", path);
+		prom_debug("\tnode = 0x%x\n", node);
+		prom_debug("\tbase = 0x%llx\n", base);
+		prom_debug("\tsize = 0x%x\n", minsize);
+
+		/* Initialize the table to have a one-to-one mapping
+		 * over the allocated size.
+		 */
+		tce_entryp = (u64 *)base;
+		for (i = 0; i < (minsize >> 3) ;tce_entryp++, i++) {
+			tce_entry = (i << PAGE_SHIFT);
+			tce_entry |= 0x3;
+			*tce_entryp = tce_entry;
+		}
+
+		prom_printf("opening PHB %s", path);
+		phb_node = call_prom("open", 1, 1, path);
+		if (phb_node == 0)
+			prom_printf("... failed\n");
+		else
+			prom_printf("... done\n");
+
+		call_prom("call-method", 6, 0, ADDR("set-64-bit-addressing"),
+			  phb_node, -1, minsize,
+			  (u32) base, (u32) (base >> 32));
+		call_prom("close", 1, 0, phb_node);
+	}
+
+	reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom);
+
+	/* These are only really needed if there is a memory limit in
+	 * effect, but we don't know so export them always. */
+	prom_tce_alloc_start = local_alloc_bottom;
+	prom_tce_alloc_end = local_alloc_top;
+
+	/* Flag the first invalid entry */
+	prom_debug("ending prom_initialize_tce_table\n");
+}
+#endif /* __BIG_ENDIAN__ */
+#endif /* CONFIG_PPC64 */
+
+/*
+ * With CHRP SMP we need to use the OF to start the other processors.
+ * We can't wait until smp_boot_cpus (the OF is trashed by then)
+ * so we have to put the processors into a holding pattern controlled
+ * by the kernel (not OF) before we destroy the OF.
+ *
+ * This uses a chunk of low memory, puts some holding pattern
+ * code there and sends the other processors off to there until
+ * smp_boot_cpus tells them to do something.  The holding pattern
+ * checks that address until its cpu # is there, when it is that
+ * cpu jumps to __secondary_start().  smp_boot_cpus() takes care
+ * of setting those values.
+ *
+ * We also use physical address 0x4 here to tell when a cpu
+ * is in its holding pattern code.
+ *
+ * -- Cort
+ */
+/*
+ * We want to reference the copy of __secondary_hold_* in the
+ * 0 - 0x100 address range
+ */
+#define LOW_ADDR(x)	(((unsigned long) &(x)) & 0xff)
+
+static void __init prom_hold_cpus(void)
+{
+	unsigned long i;
+	phandle node;
+	char type[64];
+	unsigned long *spinloop
+		= (void *) LOW_ADDR(__secondary_hold_spinloop);
+	unsigned long *acknowledge
+		= (void *) LOW_ADDR(__secondary_hold_acknowledge);
+	unsigned long secondary_hold = LOW_ADDR(__secondary_hold);
+
+	/*
+	 * On pseries, if RTAS supports "query-cpu-stopped-state",
+	 * we skip this stage, the CPUs will be started by the
+	 * kernel using RTAS.
+	 */
+	if ((of_platform == PLATFORM_PSERIES ||
+	     of_platform == PLATFORM_PSERIES_LPAR) &&
+	    rtas_has_query_cpu_stopped) {
+		prom_printf("prom_hold_cpus: skipped\n");
+		return;
+	}
+
+	prom_debug("prom_hold_cpus: start...\n");
+	prom_debug("    1) spinloop       = 0x%lx\n", (unsigned long)spinloop);
+	prom_debug("    1) *spinloop      = 0x%lx\n", *spinloop);
+	prom_debug("    1) acknowledge    = 0x%lx\n",
+		   (unsigned long)acknowledge);
+	prom_debug("    1) *acknowledge   = 0x%lx\n", *acknowledge);
+	prom_debug("    1) secondary_hold = 0x%lx\n", secondary_hold);
+
+	/* Set the common spinloop variable, so all of the secondary cpus
+	 * will block when they are awakened from their OF spinloop.
+	 * This must occur for both SMP and non SMP kernels, since OF will
+	 * be trashed when we move the kernel.
+	 */
+	*spinloop = 0;
+
+	/* look for cpus */
+	for (node = 0; prom_next_node(&node); ) {
+		unsigned int cpu_no;
+		__be32 reg;
+
+		type[0] = 0;
+		prom_getprop(node, "device_type", type, sizeof(type));
+		if (strcmp(type, "cpu") != 0)
+			continue;
+
+		/* Skip non-configured cpus. */
+		if (prom_getprop(node, "status", type, sizeof(type)) > 0)
+			if (strcmp(type, "okay") != 0)
+				continue;
+
+		reg = cpu_to_be32(-1); /* make sparse happy */
+		prom_getprop(node, "reg", &reg, sizeof(reg));
+		cpu_no = be32_to_cpu(reg);
+
+		prom_debug("cpu hw idx   = %u\n", cpu_no);
+
+		/* Init the acknowledge var which will be reset by
+		 * the secondary cpu when it awakens from its OF
+		 * spinloop.
+		 */
+		*acknowledge = (unsigned long)-1;
+
+		if (cpu_no != prom.cpu) {
+			/* Primary Thread of non-boot cpu or any thread */
+			prom_printf("starting cpu hw idx %u... ", cpu_no);
+			call_prom("start-cpu", 3, 0, node,
+				  secondary_hold, cpu_no);
+
+			for (i = 0; (i < 100000000) && 
+			     (*acknowledge == ((unsigned long)-1)); i++ )
+				mb();
+
+			if (*acknowledge == cpu_no)
+				prom_printf("done\n");
+			else
+				prom_printf("failed: %lx\n", *acknowledge);
+		}
+#ifdef CONFIG_SMP
+		else
+			prom_printf("boot cpu hw idx %u\n", cpu_no);
+#endif /* CONFIG_SMP */
+	}
+
+	prom_debug("prom_hold_cpus: end...\n");
+}
+
+
+static void __init prom_init_client_services(unsigned long pp)
+{
+	/* Get a handle to the prom entry point before anything else */
+	prom_entry = pp;
+
+	/* get a handle for the stdout device */
+	prom.chosen = call_prom("finddevice", 1, 1, ADDR("/chosen"));
+	if (!PHANDLE_VALID(prom.chosen))
+		prom_panic("cannot find chosen"); /* msg won't be printed :( */
+
+	/* get device tree root */
+	prom.root = call_prom("finddevice", 1, 1, ADDR("/"));
+	if (!PHANDLE_VALID(prom.root))
+		prom_panic("cannot find device tree root"); /* msg won't be printed :( */
+
+	prom.mmumap = 0;
+}
+
+#ifdef CONFIG_PPC32
+/*
+ * For really old powermacs, we need to map things we claim.
+ * For that, we need the ihandle of the mmu.
+ * Also, on the longtrail, we need to work around other bugs.
+ */
+static void __init prom_find_mmu(void)
+{
+	phandle oprom;
+	char version[64];
+
+	oprom = call_prom("finddevice", 1, 1, ADDR("/openprom"));
+	if (!PHANDLE_VALID(oprom))
+		return;
+	if (prom_getprop(oprom, "model", version, sizeof(version)) <= 0)
+		return;
+	version[sizeof(version) - 1] = 0;
+	/* XXX might need to add other versions here */
+	if (strcmp(version, "Open Firmware, 1.0.5") == 0)
+		of_workarounds = OF_WA_CLAIM;
+	else if (strncmp(version, "FirmWorks,3.", 12) == 0) {
+		of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL;
+		call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim");
+	} else
+		return;
+	prom.memory = call_prom("open", 1, 1, ADDR("/memory"));
+	prom_getprop(prom.chosen, "mmu", &prom.mmumap,
+		     sizeof(prom.mmumap));
+	prom.mmumap = be32_to_cpu(prom.mmumap);
+	if (!IHANDLE_VALID(prom.memory) || !IHANDLE_VALID(prom.mmumap))
+		of_workarounds &= ~OF_WA_CLAIM;		/* hmmm */
+}
+#else
+#define prom_find_mmu()
+#endif
+
+static void __init prom_init_stdout(void)
+{
+	char *path = of_stdout_device;
+	char type[16];
+	phandle stdout_node;
+	__be32 val;
+
+	if (prom_getprop(prom.chosen, "stdout", &val, sizeof(val)) <= 0)
+		prom_panic("cannot find stdout");
+
+	prom.stdout = be32_to_cpu(val);
+
+	/* Get the full OF pathname of the stdout device */
+	memset(path, 0, 256);
+	call_prom("instance-to-path", 3, 1, prom.stdout, path, 255);
+	prom_printf("OF stdout device is: %s\n", of_stdout_device);
+	prom_setprop(prom.chosen, "/chosen", "linux,stdout-path",
+		     path, strlen(path) + 1);
+
+	/* instance-to-package fails on PA-Semi */
+	stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout);
+	if (stdout_node != PROM_ERROR) {
+		val = cpu_to_be32(stdout_node);
+
+		/* If it's a display, note it */
+		memset(type, 0, sizeof(type));
+		prom_getprop(stdout_node, "device_type", type, sizeof(type));
+		if (strcmp(type, "display") == 0)
+			prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0);
+	}
+}
+
+static int __init prom_find_machine_type(void)
+{
+	char compat[256];
+	int len, i = 0;
+#ifdef CONFIG_PPC64
+	phandle rtas;
+	int x;
+#endif
+
+	/* Look for a PowerMac or a Cell */
+	len = prom_getprop(prom.root, "compatible",
+			   compat, sizeof(compat)-1);
+	if (len > 0) {
+		compat[len] = 0;
+		while (i < len) {
+			char *p = &compat[i];
+			int sl = strlen(p);
+			if (sl == 0)
+				break;
+			if (strstr(p, "Power Macintosh") ||
+			    strstr(p, "MacRISC"))
+				return PLATFORM_POWERMAC;
+#ifdef CONFIG_PPC64
+			/* We must make sure we don't detect the IBM Cell
+			 * blades as pSeries due to some firmware issues,
+			 * so we do it here.
+			 */
+			if (strstr(p, "IBM,CBEA") ||
+			    strstr(p, "IBM,CPBW-1.0"))
+				return PLATFORM_GENERIC;
+#endif /* CONFIG_PPC64 */
+			i += sl + 1;
+		}
+	}
+#ifdef CONFIG_PPC64
+	/* Try to detect OPAL */
+	if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal"))))
+		return PLATFORM_OPAL;
+
+	/* Try to figure out if it's an IBM pSeries or any other
+	 * PAPR compliant platform. We assume it is if :
+	 *  - /device_type is "chrp" (please, do NOT use that for future
+	 *    non-IBM designs !
+	 *  - it has /rtas
+	 */
+	len = prom_getprop(prom.root, "device_type",
+			   compat, sizeof(compat)-1);
+	if (len <= 0)
+		return PLATFORM_GENERIC;
+	if (strcmp(compat, "chrp"))
+		return PLATFORM_GENERIC;
+
+	/* Default to pSeries. We need to know if we are running LPAR */
+	rtas = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+	if (!PHANDLE_VALID(rtas))
+		return PLATFORM_GENERIC;
+	x = prom_getproplen(rtas, "ibm,hypertas-functions");
+	if (x != PROM_ERROR) {
+		prom_debug("Hypertas detected, assuming LPAR !\n");
+		return PLATFORM_PSERIES_LPAR;
+	}
+	return PLATFORM_PSERIES;
+#else
+	return PLATFORM_GENERIC;
+#endif
+}
+
+static int __init prom_set_color(ihandle ih, int i, int r, int g, int b)
+{
+	return call_prom("call-method", 6, 1, ADDR("color!"), ih, i, b, g, r);
+}
+
+/*
+ * If we have a display that we don't know how to drive,
+ * we will want to try to execute OF's open method for it
+ * later.  However, OF will probably fall over if we do that
+ * we've taken over the MMU.
+ * So we check whether we will need to open the display,
+ * and if so, open it now.
+ */
+static void __init prom_check_displays(void)
+{
+	char type[16], *path;
+	phandle node;
+	ihandle ih;
+	int i;
+
+	static unsigned char default_colors[] = {
+		0x00, 0x00, 0x00,
+		0x00, 0x00, 0xaa,
+		0x00, 0xaa, 0x00,
+		0x00, 0xaa, 0xaa,
+		0xaa, 0x00, 0x00,
+		0xaa, 0x00, 0xaa,
+		0xaa, 0xaa, 0x00,
+		0xaa, 0xaa, 0xaa,
+		0x55, 0x55, 0x55,
+		0x55, 0x55, 0xff,
+		0x55, 0xff, 0x55,
+		0x55, 0xff, 0xff,
+		0xff, 0x55, 0x55,
+		0xff, 0x55, 0xff,
+		0xff, 0xff, 0x55,
+		0xff, 0xff, 0xff
+	};
+	const unsigned char *clut;
+
+	prom_debug("Looking for displays\n");
+	for (node = 0; prom_next_node(&node); ) {
+		memset(type, 0, sizeof(type));
+		prom_getprop(node, "device_type", type, sizeof(type));
+		if (strcmp(type, "display") != 0)
+			continue;
+
+		/* It seems OF doesn't null-terminate the path :-( */
+		path = prom_scratch;
+		memset(path, 0, PROM_SCRATCH_SIZE);
+
+		/*
+		 * leave some room at the end of the path for appending extra
+		 * arguments
+		 */
+		if (call_prom("package-to-path", 3, 1, node, path,
+			      PROM_SCRATCH_SIZE-10) == PROM_ERROR)
+			continue;
+		prom_printf("found display   : %s, opening... ", path);
+		
+		ih = call_prom("open", 1, 1, path);
+		if (ih == 0) {
+			prom_printf("failed\n");
+			continue;
+		}
+
+		/* Success */
+		prom_printf("done\n");
+		prom_setprop(node, path, "linux,opened", NULL, 0);
+
+		/* Setup a usable color table when the appropriate
+		 * method is available. Should update this to set-colors */
+		clut = default_colors;
+		for (i = 0; i < 16; i++, clut += 3)
+			if (prom_set_color(ih, i, clut[0], clut[1],
+					   clut[2]) != 0)
+				break;
+
+#ifdef CONFIG_LOGO_LINUX_CLUT224
+		clut = PTRRELOC(logo_linux_clut224.clut);
+		for (i = 0; i < logo_linux_clut224.clutsize; i++, clut += 3)
+			if (prom_set_color(ih, i + 32, clut[0], clut[1],
+					   clut[2]) != 0)
+				break;
+#endif /* CONFIG_LOGO_LINUX_CLUT224 */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+		if (prom_getprop(node, "linux,boot-display", NULL, 0) !=
+		    PROM_ERROR) {
+			u32 width, height, pitch, addr;
+
+			prom_printf("Setting btext !\n");
+			prom_getprop(node, "width", &width, 4);
+			prom_getprop(node, "height", &height, 4);
+			prom_getprop(node, "linebytes", &pitch, 4);
+			prom_getprop(node, "address", &addr, 4);
+			prom_printf("W=%d H=%d LB=%d addr=0x%x\n",
+				    width, height, pitch, addr);
+			btext_setup_display(width, height, 8, pitch, addr);
+		}
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
+	}
+}
+
+
+/* Return (relocated) pointer to this much memory: moves initrd if reqd. */
+static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end,
+			      unsigned long needed, unsigned long align)
+{
+	void *ret;
+
+	*mem_start = _ALIGN(*mem_start, align);
+	while ((*mem_start + needed) > *mem_end) {
+		unsigned long room, chunk;
+
+		prom_debug("Chunk exhausted, claiming more at %lx...\n",
+			   alloc_bottom);
+		room = alloc_top - alloc_bottom;
+		if (room > DEVTREE_CHUNK_SIZE)
+			room = DEVTREE_CHUNK_SIZE;
+		if (room < PAGE_SIZE)
+			prom_panic("No memory for flatten_device_tree "
+				   "(no room)\n");
+		chunk = alloc_up(room, 0);
+		if (chunk == 0)
+			prom_panic("No memory for flatten_device_tree "
+				   "(claim failed)\n");
+		*mem_end = chunk + room;
+	}
+
+	ret = (void *)*mem_start;
+	*mem_start += needed;
+
+	return ret;
+}
+
+#define dt_push_token(token, mem_start, mem_end) do { 			\
+		void *room = make_room(mem_start, mem_end, 4, 4);	\
+		*(__be32 *)room = cpu_to_be32(token);			\
+	} while(0)
+
+static unsigned long __init dt_find_string(char *str)
+{
+	char *s, *os;
+
+	s = os = (char *)dt_string_start;
+	s += 4;
+	while (s <  (char *)dt_string_end) {
+		if (strcmp(s, str) == 0)
+			return s - os;
+		s += strlen(s) + 1;
+	}
+	return 0;
+}
+
+/*
+ * The Open Firmware 1275 specification states properties must be 31 bytes or
+ * less, however not all firmwares obey this. Make it 64 bytes to be safe.
+ */
+#define MAX_PROPERTY_NAME 64
+
+static void __init scan_dt_build_strings(phandle node,
+					 unsigned long *mem_start,
+					 unsigned long *mem_end)
+{
+	char *prev_name, *namep, *sstart;
+	unsigned long soff;
+	phandle child;
+
+	sstart =  (char *)dt_string_start;
+
+	/* get and store all property names */
+	prev_name = "";
+	for (;;) {
+		/* 64 is max len of name including nul. */
+		namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1);
+		if (call_prom("nextprop", 3, 1, node, prev_name, namep) != 1) {
+			/* No more nodes: unwind alloc */
+			*mem_start = (unsigned long)namep;
+			break;
+		}
+
+ 		/* skip "name" */
+ 		if (strcmp(namep, "name") == 0) {
+ 			*mem_start = (unsigned long)namep;
+ 			prev_name = "name";
+ 			continue;
+ 		}
+		/* get/create string entry */
+		soff = dt_find_string(namep);
+		if (soff != 0) {
+			*mem_start = (unsigned long)namep;
+			namep = sstart + soff;
+		} else {
+			/* Trim off some if we can */
+			*mem_start = (unsigned long)namep + strlen(namep) + 1;
+			dt_string_end = *mem_start;
+		}
+		prev_name = namep;
+	}
+
+	/* do all our children */
+	child = call_prom("child", 1, 1, node);
+	while (child != 0) {
+		scan_dt_build_strings(child, mem_start, mem_end);
+		child = call_prom("peer", 1, 1, child);
+	}
+}
+
+static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
+					unsigned long *mem_end)
+{
+	phandle child;
+	char *namep, *prev_name, *sstart, *p, *ep, *lp, *path;
+	unsigned long soff;
+	unsigned char *valp;
+	static char pname[MAX_PROPERTY_NAME];
+	int l, room, has_phandle = 0;
+
+	dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end);
+
+	/* get the node's full name */
+	namep = (char *)*mem_start;
+	room = *mem_end - *mem_start;
+	if (room > 255)
+		room = 255;
+	l = call_prom("package-to-path", 3, 1, node, namep, room);
+	if (l >= 0) {
+		/* Didn't fit?  Get more room. */
+		if (l >= room) {
+			if (l >= *mem_end - *mem_start)
+				namep = make_room(mem_start, mem_end, l+1, 1);
+			call_prom("package-to-path", 3, 1, node, namep, l);
+		}
+		namep[l] = '\0';
+
+		/* Fixup an Apple bug where they have bogus \0 chars in the
+		 * middle of the path in some properties, and extract
+		 * the unit name (everything after the last '/').
+		 */
+		for (lp = p = namep, ep = namep + l; p < ep; p++) {
+			if (*p == '/')
+				lp = namep;
+			else if (*p != 0)
+				*lp++ = *p;
+		}
+		*lp = 0;
+		*mem_start = _ALIGN((unsigned long)lp + 1, 4);
+	}
+
+	/* get it again for debugging */
+	path = prom_scratch;
+	memset(path, 0, PROM_SCRATCH_SIZE);
+	call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
+
+	/* get and store all properties */
+	prev_name = "";
+	sstart = (char *)dt_string_start;
+	for (;;) {
+		if (call_prom("nextprop", 3, 1, node, prev_name,
+			      pname) != 1)
+			break;
+
+ 		/* skip "name" */
+ 		if (strcmp(pname, "name") == 0) {
+ 			prev_name = "name";
+ 			continue;
+ 		}
+
+		/* find string offset */
+		soff = dt_find_string(pname);
+		if (soff == 0) {
+			prom_printf("WARNING: Can't find string index for"
+				    " <%s>, node %s\n", pname, path);
+			break;
+		}
+		prev_name = sstart + soff;
+
+		/* get length */
+		l = call_prom("getproplen", 2, 1, node, pname);
+
+		/* sanity checks */
+		if (l == PROM_ERROR)
+			continue;
+
+		/* push property head */
+		dt_push_token(OF_DT_PROP, mem_start, mem_end);
+		dt_push_token(l, mem_start, mem_end);
+		dt_push_token(soff, mem_start, mem_end);
+
+		/* push property content */
+		valp = make_room(mem_start, mem_end, l, 4);
+		call_prom("getprop", 4, 1, node, pname, valp, l);
+		*mem_start = _ALIGN(*mem_start, 4);
+
+		if (!strcmp(pname, "phandle"))
+			has_phandle = 1;
+	}
+
+	/* Add a "linux,phandle" property if no "phandle" property already
+	 * existed (can happen with OPAL)
+	 */
+	if (!has_phandle) {
+		soff = dt_find_string("linux,phandle");
+		if (soff == 0)
+			prom_printf("WARNING: Can't find string index for"
+				    " <linux-phandle> node %s\n", path);
+		else {
+			dt_push_token(OF_DT_PROP, mem_start, mem_end);
+			dt_push_token(4, mem_start, mem_end);
+			dt_push_token(soff, mem_start, mem_end);
+			valp = make_room(mem_start, mem_end, 4, 4);
+			*(__be32 *)valp = cpu_to_be32(node);
+		}
+	}
+
+	/* do all our children */
+	child = call_prom("child", 1, 1, node);
+	while (child != 0) {
+		scan_dt_build_struct(child, mem_start, mem_end);
+		child = call_prom("peer", 1, 1, child);
+	}
+
+	dt_push_token(OF_DT_END_NODE, mem_start, mem_end);
+}
+
+static void __init flatten_device_tree(void)
+{
+	phandle root;
+	unsigned long mem_start, mem_end, room;
+	struct boot_param_header *hdr;
+	char *namep;
+	u64 *rsvmap;
+
+	/*
+	 * Check how much room we have between alloc top & bottom (+/- a
+	 * few pages), crop to 1MB, as this is our "chunk" size
+	 */
+	room = alloc_top - alloc_bottom - 0x4000;
+	if (room > DEVTREE_CHUNK_SIZE)
+		room = DEVTREE_CHUNK_SIZE;
+	prom_debug("starting device tree allocs at %lx\n", alloc_bottom);
+
+	/* Now try to claim that */
+	mem_start = (unsigned long)alloc_up(room, PAGE_SIZE);
+	if (mem_start == 0)
+		prom_panic("Can't allocate initial device-tree chunk\n");
+	mem_end = mem_start + room;
+
+	/* Get root of tree */
+	root = call_prom("peer", 1, 1, (phandle)0);
+	if (root == (phandle)0)
+		prom_panic ("couldn't get device tree root\n");
+
+	/* Build header and make room for mem rsv map */ 
+	mem_start = _ALIGN(mem_start, 4);
+	hdr = make_room(&mem_start, &mem_end,
+			sizeof(struct boot_param_header), 4);
+	dt_header_start = (unsigned long)hdr;
+	rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8);
+
+	/* Start of strings */
+	mem_start = PAGE_ALIGN(mem_start);
+	dt_string_start = mem_start;
+	mem_start += 4; /* hole */
+
+	/* Add "linux,phandle" in there, we'll need it */
+	namep = make_room(&mem_start, &mem_end, 16, 1);
+	strcpy(namep, "linux,phandle");
+	mem_start = (unsigned long)namep + strlen(namep) + 1;
+
+	/* Build string array */
+	prom_printf("Building dt strings...\n"); 
+	scan_dt_build_strings(root, &mem_start, &mem_end);
+	dt_string_end = mem_start;
+
+	/* Build structure */
+	mem_start = PAGE_ALIGN(mem_start);
+	dt_struct_start = mem_start;
+	prom_printf("Building dt structure...\n"); 
+	scan_dt_build_struct(root, &mem_start, &mem_end);
+	dt_push_token(OF_DT_END, &mem_start, &mem_end);
+	dt_struct_end = PAGE_ALIGN(mem_start);
+
+	/* Finish header */
+	hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
+	hdr->magic = cpu_to_be32(OF_DT_HEADER);
+	hdr->totalsize = cpu_to_be32(dt_struct_end - dt_header_start);
+	hdr->off_dt_struct = cpu_to_be32(dt_struct_start - dt_header_start);
+	hdr->off_dt_strings = cpu_to_be32(dt_string_start - dt_header_start);
+	hdr->dt_strings_size = cpu_to_be32(dt_string_end - dt_string_start);
+	hdr->off_mem_rsvmap = cpu_to_be32(((unsigned long)rsvmap) - dt_header_start);
+	hdr->version = cpu_to_be32(OF_DT_VERSION);
+	/* Version 16 is not backward compatible */
+	hdr->last_comp_version = cpu_to_be32(0x10);
+
+	/* Copy the reserve map in */
+	memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));
+
+#ifdef DEBUG_PROM
+	{
+		int i;
+		prom_printf("reserved memory map:\n");
+		for (i = 0; i < mem_reserve_cnt; i++)
+			prom_printf("  %llx - %llx\n",
+				    be64_to_cpu(mem_reserve_map[i].base),
+				    be64_to_cpu(mem_reserve_map[i].size));
+	}
+#endif
+	/* Bump mem_reserve_cnt to cause further reservations to fail
+	 * since it's too late.
+	 */
+	mem_reserve_cnt = MEM_RESERVE_MAP_SIZE;
+
+	prom_printf("Device tree strings 0x%lx -> 0x%lx\n",
+		    dt_string_start, dt_string_end);
+	prom_printf("Device tree struct  0x%lx -> 0x%lx\n",
+		    dt_struct_start, dt_struct_end);
+}
+
+#ifdef CONFIG_PPC_MAPLE
+/* PIBS Version 1.05.0000 04/26/2005 has an incorrect /ht/isa/ranges property.
+ * The values are bad, and it doesn't even have the right number of cells. */
+static void __init fixup_device_tree_maple(void)
+{
+	phandle isa;
+	u32 rloc = 0x01002000; /* IO space; PCI device = 4 */
+	u32 isa_ranges[6];
+	char *name;
+
+	name = "/ht@0/isa@4";
+	isa = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(isa)) {
+		name = "/ht@0/isa@6";
+		isa = call_prom("finddevice", 1, 1, ADDR(name));
+		rloc = 0x01003000; /* IO space; PCI device = 6 */
+	}
+	if (!PHANDLE_VALID(isa))
+		return;
+
+	if (prom_getproplen(isa, "ranges") != 12)
+		return;
+	if (prom_getprop(isa, "ranges", isa_ranges, sizeof(isa_ranges))
+		== PROM_ERROR)
+		return;
+
+	if (isa_ranges[0] != 0x1 ||
+		isa_ranges[1] != 0xf4000000 ||
+		isa_ranges[2] != 0x00010000)
+		return;
+
+	prom_printf("Fixing up bogus ISA range on Maple/Apache...\n");
+
+	isa_ranges[0] = 0x1;
+	isa_ranges[1] = 0x0;
+	isa_ranges[2] = rloc;
+	isa_ranges[3] = 0x0;
+	isa_ranges[4] = 0x0;
+	isa_ranges[5] = 0x00010000;
+	prom_setprop(isa, name, "ranges",
+			isa_ranges, sizeof(isa_ranges));
+}
+
+#define CPC925_MC_START		0xf8000000
+#define CPC925_MC_LENGTH	0x1000000
+/* The values for memory-controller don't have right number of cells */
+static void __init fixup_device_tree_maple_memory_controller(void)
+{
+	phandle mc;
+	u32 mc_reg[4];
+	char *name = "/hostbridge@f8000000";
+	u32 ac, sc;
+
+	mc = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(mc))
+		return;
+
+	if (prom_getproplen(mc, "reg") != 8)
+		return;
+
+	prom_getprop(prom.root, "#address-cells", &ac, sizeof(ac));
+	prom_getprop(prom.root, "#size-cells", &sc, sizeof(sc));
+	if ((ac != 2) || (sc != 2))
+		return;
+
+	if (prom_getprop(mc, "reg", mc_reg, sizeof(mc_reg)) == PROM_ERROR)
+		return;
+
+	if (mc_reg[0] != CPC925_MC_START || mc_reg[1] != CPC925_MC_LENGTH)
+		return;
+
+	prom_printf("Fixing up bogus hostbridge on Maple...\n");
+
+	mc_reg[0] = 0x0;
+	mc_reg[1] = CPC925_MC_START;
+	mc_reg[2] = 0x0;
+	mc_reg[3] = CPC925_MC_LENGTH;
+	prom_setprop(mc, name, "reg", mc_reg, sizeof(mc_reg));
+}
+#else
+#define fixup_device_tree_maple()
+#define fixup_device_tree_maple_memory_controller()
+#endif
+
+#ifdef CONFIG_PPC_CHRP
+/*
+ * Pegasos and BriQ lacks the "ranges" property in the isa node
+ * Pegasos needs decimal IRQ 14/15, not hexadecimal
+ * Pegasos has the IDE configured in legacy mode, but advertised as native
+ */
+static void __init fixup_device_tree_chrp(void)
+{
+	phandle ph;
+	u32 prop[6];
+	u32 rloc = 0x01006000; /* IO space; PCI device = 12 */
+	char *name;
+	int rc;
+
+	name = "/pci@80000000/isa@c";
+	ph = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(ph)) {
+		name = "/pci@ff500000/isa@6";
+		ph = call_prom("finddevice", 1, 1, ADDR(name));
+		rloc = 0x01003000; /* IO space; PCI device = 6 */
+	}
+	if (PHANDLE_VALID(ph)) {
+		rc = prom_getproplen(ph, "ranges");
+		if (rc == 0 || rc == PROM_ERROR) {
+			prom_printf("Fixing up missing ISA range on Pegasos...\n");
+
+			prop[0] = 0x1;
+			prop[1] = 0x0;
+			prop[2] = rloc;
+			prop[3] = 0x0;
+			prop[4] = 0x0;
+			prop[5] = 0x00010000;
+			prom_setprop(ph, name, "ranges", prop, sizeof(prop));
+		}
+	}
+
+	name = "/pci@80000000/ide@C,1";
+	ph = call_prom("finddevice", 1, 1, ADDR(name));
+	if (PHANDLE_VALID(ph)) {
+		prom_printf("Fixing up IDE interrupt on Pegasos...\n");
+		prop[0] = 14;
+		prop[1] = 0x0;
+		prom_setprop(ph, name, "interrupts", prop, 2*sizeof(u32));
+		prom_printf("Fixing up IDE class-code on Pegasos...\n");
+		rc = prom_getprop(ph, "class-code", prop, sizeof(u32));
+		if (rc == sizeof(u32)) {
+			prop[0] &= ~0x5;
+			prom_setprop(ph, name, "class-code", prop, sizeof(u32));
+		}
+	}
+}
+#else
+#define fixup_device_tree_chrp()
+#endif
+
+#if defined(CONFIG_PPC64) && defined(CONFIG_PPC_PMAC)
+static void __init fixup_device_tree_pmac(void)
+{
+	phandle u3, i2c, mpic;
+	u32 u3_rev;
+	u32 interrupts[2];
+	u32 parent;
+
+	/* Some G5s have a missing interrupt definition, fix it up here */
+	u3 = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000"));
+	if (!PHANDLE_VALID(u3))
+		return;
+	i2c = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000/i2c@f8001000"));
+	if (!PHANDLE_VALID(i2c))
+		return;
+	mpic = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000/mpic@f8040000"));
+	if (!PHANDLE_VALID(mpic))
+		return;
+
+	/* check if proper rev of u3 */
+	if (prom_getprop(u3, "device-rev", &u3_rev, sizeof(u3_rev))
+	    == PROM_ERROR)
+		return;
+	if (u3_rev < 0x35 || u3_rev > 0x39)
+		return;
+	/* does it need fixup ? */
+	if (prom_getproplen(i2c, "interrupts") > 0)
+		return;
+
+	prom_printf("fixing up bogus interrupts for u3 i2c...\n");
+
+	/* interrupt on this revision of u3 is number 0 and level */
+	interrupts[0] = 0;
+	interrupts[1] = 1;
+	prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupts",
+		     &interrupts, sizeof(interrupts));
+	parent = (u32)mpic;
+	prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupt-parent",
+		     &parent, sizeof(parent));
+}
+#else
+#define fixup_device_tree_pmac()
+#endif
+
+#ifdef CONFIG_PPC_EFIKA
+/*
+ * The MPC5200 FEC driver requires an phy-handle property to tell it how
+ * to talk to the phy.  If the phy-handle property is missing, then this
+ * function is called to add the appropriate nodes and link it to the
+ * ethernet node.
+ */
+static void __init fixup_device_tree_efika_add_phy(void)
+{
+	u32 node;
+	char prop[64];
+	int rv;
+
+	/* Check if /builtin/ethernet exists - bail if it doesn't */
+	node = call_prom("finddevice", 1, 1, ADDR("/builtin/ethernet"));
+	if (!PHANDLE_VALID(node))
+		return;
+
+	/* Check if the phy-handle property exists - bail if it does */
+	rv = prom_getprop(node, "phy-handle", prop, sizeof(prop));
+	if (rv <= 0)
+		return;
+
+	/*
+	 * At this point the ethernet device doesn't have a phy described.
+	 * Now we need to add the missing phy node and linkage
+	 */
+
+	/* Check for an MDIO bus node - if missing then create one */
+	node = call_prom("finddevice", 1, 1, ADDR("/builtin/mdio"));
+	if (!PHANDLE_VALID(node)) {
+		prom_printf("Adding Ethernet MDIO node\n");
+		call_prom("interpret", 1, 1,
+			" s\" /builtin\" find-device"
+			" new-device"
+				" 1 encode-int s\" #address-cells\" property"
+				" 0 encode-int s\" #size-cells\" property"
+				" s\" mdio\" device-name"
+				" s\" fsl,mpc5200b-mdio\" encode-string"
+				" s\" compatible\" property"
+				" 0xf0003000 0x400 reg"
+				" 0x2 encode-int"
+				" 0x5 encode-int encode+"
+				" 0x3 encode-int encode+"
+				" s\" interrupts\" property"
+			" finish-device");
+	};
+
+	/* Check for a PHY device node - if missing then create one and
+	 * give it's phandle to the ethernet node */
+	node = call_prom("finddevice", 1, 1,
+			 ADDR("/builtin/mdio/ethernet-phy"));
+	if (!PHANDLE_VALID(node)) {
+		prom_printf("Adding Ethernet PHY node\n");
+		call_prom("interpret", 1, 1,
+			" s\" /builtin/mdio\" find-device"
+			" new-device"
+				" s\" ethernet-phy\" device-name"
+				" 0x10 encode-int s\" reg\" property"
+				" my-self"
+				" ihandle>phandle"
+			" finish-device"
+			" s\" /builtin/ethernet\" find-device"
+				" encode-int"
+				" s\" phy-handle\" property"
+			" device-end");
+	}
+}
+
+static void __init fixup_device_tree_efika(void)
+{
+	int sound_irq[3] = { 2, 2, 0 };
+	int bcomm_irq[3*16] = { 3,0,0, 3,1,0, 3,2,0, 3,3,0,
+				3,4,0, 3,5,0, 3,6,0, 3,7,0,
+				3,8,0, 3,9,0, 3,10,0, 3,11,0,
+				3,12,0, 3,13,0, 3,14,0, 3,15,0 };
+	u32 node;
+	char prop[64];
+	int rv, len;
+
+	/* Check if we're really running on a EFIKA */
+	node = call_prom("finddevice", 1, 1, ADDR("/"));
+	if (!PHANDLE_VALID(node))
+		return;
+
+	rv = prom_getprop(node, "model", prop, sizeof(prop));
+	if (rv == PROM_ERROR)
+		return;
+	if (strcmp(prop, "EFIKA5K2"))
+		return;
+
+	prom_printf("Applying EFIKA device tree fixups\n");
+
+	/* Claiming to be 'chrp' is death */
+	node = call_prom("finddevice", 1, 1, ADDR("/"));
+	rv = prom_getprop(node, "device_type", prop, sizeof(prop));
+	if (rv != PROM_ERROR && (strcmp(prop, "chrp") == 0))
+		prom_setprop(node, "/", "device_type", "efika", sizeof("efika"));
+
+	/* CODEGEN,description is exposed in /proc/cpuinfo so
+	   fix that too */
+	rv = prom_getprop(node, "CODEGEN,description", prop, sizeof(prop));
+	if (rv != PROM_ERROR && (strstr(prop, "CHRP")))
+		prom_setprop(node, "/", "CODEGEN,description",
+			     "Efika 5200B PowerPC System",
+			     sizeof("Efika 5200B PowerPC System"));
+
+	/* Fixup bestcomm interrupts property */
+	node = call_prom("finddevice", 1, 1, ADDR("/builtin/bestcomm"));
+	if (PHANDLE_VALID(node)) {
+		len = prom_getproplen(node, "interrupts");
+		if (len == 12) {
+			prom_printf("Fixing bestcomm interrupts property\n");
+			prom_setprop(node, "/builtin/bestcom", "interrupts",
+				     bcomm_irq, sizeof(bcomm_irq));
+		}
+	}
+
+	/* Fixup sound interrupts property */
+	node = call_prom("finddevice", 1, 1, ADDR("/builtin/sound"));
+	if (PHANDLE_VALID(node)) {
+		rv = prom_getprop(node, "interrupts", prop, sizeof(prop));
+		if (rv == PROM_ERROR) {
+			prom_printf("Adding sound interrupts property\n");
+			prom_setprop(node, "/builtin/sound", "interrupts",
+				     sound_irq, sizeof(sound_irq));
+		}
+	}
+
+	/* Make sure ethernet phy-handle property exists */
+	fixup_device_tree_efika_add_phy();
+}
+#else
+#define fixup_device_tree_efika()
+#endif
+
+#ifdef CONFIG_PPC_PASEMI_NEMO
+/*
+ * CFE supplied on Nemo is broken in several ways, biggest
+ * problem is that it reassigns ISA interrupts to unused mpic ints.
+ * Add an interrupt-controller property for the io-bridge to use
+ * and correct the ints so we can attach them to an irq_domain
+ */
+static void __init fixup_device_tree_pasemi(void)
+{
+	u32 interrupts[2], parent, rval, val = 0;
+	char *name, *pci_name;
+	phandle iob, node;
+
+	/* Find the root pci node */
+	name = "/pxp@0,e0000000";
+	iob = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(iob))
+		return;
+
+	/* check if interrupt-controller node set yet */
+	if (prom_getproplen(iob, "interrupt-controller") !=PROM_ERROR)
+		return;
+
+	prom_printf("adding interrupt-controller property for SB600...\n");
+
+	prom_setprop(iob, name, "interrupt-controller", &val, 0);
+
+	pci_name = "/pxp@0,e0000000/pci@11";
+	node = call_prom("finddevice", 1, 1, ADDR(pci_name));
+	parent = ADDR(iob);
+
+	for( ; prom_next_node(&node); ) {
+		/* scan each node for one with an interrupt */
+		if (!PHANDLE_VALID(node))
+			continue;
+
+		rval = prom_getproplen(node, "interrupts");
+		if (rval == 0 || rval == PROM_ERROR)
+			continue;
+
+		prom_getprop(node, "interrupts", &interrupts, sizeof(interrupts));
+		if ((interrupts[0] < 212) || (interrupts[0] > 222))
+			continue;
+
+		/* found a node, update both interrupts and interrupt-parent */
+		if ((interrupts[0] >= 212) && (interrupts[0] <= 215))
+			interrupts[0] -= 203;
+		if ((interrupts[0] >= 216) && (interrupts[0] <= 220))
+			interrupts[0] -= 213;
+		if (interrupts[0] == 221)
+			interrupts[0] = 14;
+		if (interrupts[0] == 222)
+			interrupts[0] = 8;
+
+		prom_setprop(node, pci_name, "interrupts", interrupts,
+					sizeof(interrupts));
+		prom_setprop(node, pci_name, "interrupt-parent", &parent,
+					sizeof(parent));
+	}
+
+	/*
+	 * The io-bridge has device_type set to 'io-bridge' change it to 'isa'
+	 * so that generic isa-bridge code can add the SB600 and its on-board
+	 * peripherals.
+	 */
+	name = "/pxp@0,e0000000/io-bridge@0";
+	iob = call_prom("finddevice", 1, 1, ADDR(name));
+	if (!PHANDLE_VALID(iob))
+		return;
+
+	/* device_type is already set, just change it. */
+
+	prom_printf("Changing device_type of SB600 node...\n");
+
+	prom_setprop(iob, name, "device_type", "isa", sizeof("isa"));
+}
+#else	/* !CONFIG_PPC_PASEMI_NEMO */
+static inline void fixup_device_tree_pasemi(void) { }
+#endif
+
+static void __init fixup_device_tree(void)
+{
+	fixup_device_tree_maple();
+	fixup_device_tree_maple_memory_controller();
+	fixup_device_tree_chrp();
+	fixup_device_tree_pmac();
+	fixup_device_tree_efika();
+	fixup_device_tree_pasemi();
+}
+
+static void __init prom_find_boot_cpu(void)
+{
+	__be32 rval;
+	ihandle prom_cpu;
+	phandle cpu_pkg;
+
+	rval = 0;
+	if (prom_getprop(prom.chosen, "cpu", &rval, sizeof(rval)) <= 0)
+		return;
+	prom_cpu = be32_to_cpu(rval);
+
+	cpu_pkg = call_prom("instance-to-package", 1, 1, prom_cpu);
+
+	if (!PHANDLE_VALID(cpu_pkg))
+		return;
+
+	prom_getprop(cpu_pkg, "reg", &rval, sizeof(rval));
+	prom.cpu = be32_to_cpu(rval);
+
+	prom_debug("Booting CPU hw index = %d\n", prom.cpu);
+}
+
+static void __init prom_check_initrd(unsigned long r3, unsigned long r4)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (r3 && r4 && r4 != 0xdeadbeef) {
+		__be64 val;
+
+		prom_initrd_start = is_kernel_addr(r3) ? __pa(r3) : r3;
+		prom_initrd_end = prom_initrd_start + r4;
+
+		val = cpu_to_be64(prom_initrd_start);
+		prom_setprop(prom.chosen, "/chosen", "linux,initrd-start",
+			     &val, sizeof(val));
+		val = cpu_to_be64(prom_initrd_end);
+		prom_setprop(prom.chosen, "/chosen", "linux,initrd-end",
+			     &val, sizeof(val));
+
+		reserve_mem(prom_initrd_start,
+			    prom_initrd_end - prom_initrd_start);
+
+		prom_debug("initrd_start=0x%lx\n", prom_initrd_start);
+		prom_debug("initrd_end=0x%lx\n", prom_initrd_end);
+	}
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+#ifdef CONFIG_PPC64
+#ifdef CONFIG_RELOCATABLE
+static void reloc_toc(void)
+{
+}
+
+static void unreloc_toc(void)
+{
+}
+#else
+static void __reloc_toc(unsigned long offset, unsigned long nr_entries)
+{
+	unsigned long i;
+	unsigned long *toc_entry;
+
+	/* Get the start of the TOC by using r2 directly. */
+	asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry));
+
+	for (i = 0; i < nr_entries; i++) {
+		*toc_entry = *toc_entry + offset;
+		toc_entry++;
+	}
+}
+
+static void reloc_toc(void)
+{
+	unsigned long offset = reloc_offset();
+	unsigned long nr_entries =
+		(__prom_init_toc_end - __prom_init_toc_start) / sizeof(long);
+
+	__reloc_toc(offset, nr_entries);
+
+	mb();
+}
+
+static void unreloc_toc(void)
+{
+	unsigned long offset = reloc_offset();
+	unsigned long nr_entries =
+		(__prom_init_toc_end - __prom_init_toc_start) / sizeof(long);
+
+	mb();
+
+	__reloc_toc(-offset, nr_entries);
+}
+#endif
+#endif
+
+/*
+ * We enter here early on, when the Open Firmware prom is still
+ * handling exceptions and the MMU hash table for us.
+ */
+
+unsigned long __init prom_init(unsigned long r3, unsigned long r4,
+			       unsigned long pp,
+			       unsigned long r6, unsigned long r7,
+			       unsigned long kbase)
+{	
+	unsigned long hdr;
+
+#ifdef CONFIG_PPC32
+	unsigned long offset = reloc_offset();
+	reloc_got2(offset);
+#else
+	reloc_toc();
+#endif
+
+	/*
+	 * First zero the BSS
+	 */
+	memset(&__bss_start, 0, __bss_stop - __bss_start);
+
+	/*
+	 * Init interface to Open Firmware, get some node references,
+	 * like /chosen
+	 */
+	prom_init_client_services(pp);
+
+	/*
+	 * See if this OF is old enough that we need to do explicit maps
+	 * and other workarounds
+	 */
+	prom_find_mmu();
+
+	/*
+	 * Init prom stdout device
+	 */
+	prom_init_stdout();
+
+	prom_printf("Preparing to boot %s", linux_banner);
+
+	/*
+	 * Get default machine type. At this point, we do not differentiate
+	 * between pSeries SMP and pSeries LPAR
+	 */
+	of_platform = prom_find_machine_type();
+	prom_printf("Detected machine type: %x\n", of_platform);
+
+#ifndef CONFIG_NONSTATIC_KERNEL
+	/* Bail if this is a kdump kernel. */
+	if (PHYSICAL_START > 0)
+		prom_panic("Error: You can't boot a kdump kernel from OF!\n");
+#endif
+
+	/*
+	 * Check for an initrd
+	 */
+	prom_check_initrd(r3, r4);
+
+	/*
+	 * Do early parsing of command line
+	 */
+	early_cmdline_parse();
+
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+	/*
+	 * On pSeries, inform the firmware about our capabilities
+	 */
+	if (of_platform == PLATFORM_PSERIES ||
+	    of_platform == PLATFORM_PSERIES_LPAR)
+		prom_send_capabilities();
+#endif
+
+	/*
+	 * Copy the CPU hold code
+	 */
+	if (of_platform != PLATFORM_POWERMAC)
+		copy_and_flush(0, kbase, 0x100, 0);
+
+	/*
+	 * Initialize memory management within prom_init
+	 */
+	prom_init_mem();
+
+	/*
+	 * Determine which cpu is actually running right _now_
+	 */
+	prom_find_boot_cpu();
+
+	/* 
+	 * Initialize display devices
+	 */
+	prom_check_displays();
+
+#if defined(CONFIG_PPC64) && defined(__BIG_ENDIAN__)
+	/*
+	 * Initialize IOMMU (TCE tables) on pSeries. Do that before anything else
+	 * that uses the allocator, we need to make sure we get the top of memory
+	 * available for us here...
+	 */
+	if (of_platform == PLATFORM_PSERIES)
+		prom_initialize_tce_table();
+#endif
+
+	/*
+	 * On non-powermacs, try to instantiate RTAS. PowerMacs don't
+	 * have a usable RTAS implementation.
+	 */
+	if (of_platform != PLATFORM_POWERMAC &&
+	    of_platform != PLATFORM_OPAL)
+		prom_instantiate_rtas();
+
+#ifdef CONFIG_PPC_POWERNV
+	if (of_platform == PLATFORM_OPAL)
+		prom_instantiate_opal();
+#endif /* CONFIG_PPC_POWERNV */
+
+#ifdef CONFIG_PPC64
+	/* instantiate sml */
+	prom_instantiate_sml();
+#endif
+
+	/*
+	 * On non-powermacs, put all CPUs in spin-loops.
+	 *
+	 * PowerMacs use a different mechanism to spin CPUs
+	 *
+	 * (This must be done after instanciating RTAS)
+	 */
+	if (of_platform != PLATFORM_POWERMAC &&
+	    of_platform != PLATFORM_OPAL)
+		prom_hold_cpus();
+
+	/*
+	 * Fill in some infos for use by the kernel later on
+	 */
+	if (prom_memory_limit) {
+		__be64 val = cpu_to_be64(prom_memory_limit);
+		prom_setprop(prom.chosen, "/chosen", "linux,memory-limit",
+			     &val, sizeof(val));
+	}
+#ifdef CONFIG_PPC64
+	if (prom_iommu_off)
+		prom_setprop(prom.chosen, "/chosen", "linux,iommu-off",
+			     NULL, 0);
+
+	if (prom_iommu_force_on)
+		prom_setprop(prom.chosen, "/chosen", "linux,iommu-force-on",
+			     NULL, 0);
+
+	if (prom_tce_alloc_start) {
+		prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-start",
+			     &prom_tce_alloc_start,
+			     sizeof(prom_tce_alloc_start));
+		prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-end",
+			     &prom_tce_alloc_end,
+			     sizeof(prom_tce_alloc_end));
+	}
+#endif
+
+	/*
+	 * Fixup any known bugs in the device-tree
+	 */
+	fixup_device_tree();
+
+	/*
+	 * Now finally create the flattened device-tree
+	 */
+	prom_printf("copying OF device tree...\n");
+	flatten_device_tree();
+
+	/*
+	 * in case stdin is USB and still active on IBM machines...
+	 * Unfortunately quiesce crashes on some powermacs if we have
+	 * closed stdin already (in particular the powerbook 101). It
+	 * appears that the OPAL version of OFW doesn't like it either.
+	 */
+	if (of_platform != PLATFORM_POWERMAC &&
+	    of_platform != PLATFORM_OPAL)
+		prom_close_stdin();
+
+	/*
+	 * Call OF "quiesce" method to shut down pending DMA's from
+	 * devices etc...
+	 */
+	prom_printf("Quiescing Open Firmware ...\n");
+	call_prom("quiesce", 0, 0);
+
+	/*
+	 * And finally, call the kernel passing it the flattened device
+	 * tree and NULL as r5, thus triggering the new entry point which
+	 * is common to us and kexec
+	 */
+	hdr = dt_header_start;
+
+	/* Don't print anything after quiesce under OPAL, it crashes OFW */
+	if (of_platform != PLATFORM_OPAL) {
+		prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase);
+		prom_debug("->dt_header_start=0x%lx\n", hdr);
+	}
+
+#ifdef CONFIG_PPC32
+	reloc_got2(-offset);
+#else
+	unreloc_toc();
+#endif
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+	/* OPAL early debug gets the OPAL base & entry in r8 and r9 */
+	__start(hdr, kbase, 0, 0, 0,
+		prom_opal_base, prom_opal_entry);
+#else
+	__start(hdr, kbase, 0, 0, 0, 0, 0);
+#endif
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
new file mode 100644
index 000000000..acb6b9226
--- /dev/null
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+#
+# Copyright © 2008 IBM Corporation
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version
+# 2 of the License, or (at your option) any later version.
+
+# This script checks prom_init.o to see what external symbols it
+# is using, if it finds symbols not in the whitelist it returns
+# an error. The point of this is to discourage people from
+# intentionally or accidentally adding new code to prom_init.c
+# which has side effects on other parts of the kernel.
+
+# If you really need to reference something from prom_init.o add
+# it to the list below:
+
+WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
+_end enter_prom memcpy memset reloc_offset __secondary_hold
+__secondary_hold_acknowledge __secondary_hold_spinloop __start
+strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224
+reloc_got2 kernstart_addr memstart_addr linux_banner _stext
+__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."
+
+NM="$1"
+OBJ="$2"
+
+ERROR=0
+
+for UNDEF in $($NM -u $OBJ | awk '{print $2}')
+do
+	# On 64-bit nm gives us the function descriptors, which have
+	# a leading . on the name, so strip it off here.
+	UNDEF="${UNDEF#.}"
+
+	if [ $KBUILD_VERBOSE ]; then
+		if [ $KBUILD_VERBOSE -ne 0 ]; then
+			echo "Checking prom_init.o symbol '$UNDEF'"
+		fi
+	fi
+
+	OK=0
+	for WHITE in $WHITELIST
+	do
+		if [ "$UNDEF" = "$WHITE" ]; then
+			OK=1
+			break
+		fi
+	done
+
+	# ignore register save/restore funcitons
+	case $UNDEF in
+	_restgpr_*|_restgpr0_*|_rest32gpr_*)
+		OK=1
+		;;
+	_savegpr_*|_savegpr0_*|_save32gpr_*)
+		OK=1
+		;;
+	esac
+
+	if [ $OK -eq 0 ]; then
+		ERROR=1
+		echo "Error: External symbol '$UNDEF' referenced" \
+		     "from prom_init.c" >&2
+	fi
+done
+
+exit $ERROR
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
new file mode 100644
index 000000000..9cb7f88df
--- /dev/null
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ioport.h>
+#include <linux/etherdevice.h>
+#include <linux/of_address.h>
+#include <asm/prom.h>
+
+void of_parse_dma_window(struct device_node *dn, const __be32 *dma_window,
+			 unsigned long *busno, unsigned long *phys,
+			 unsigned long *size)
+{
+	u32 cells;
+	const __be32 *prop;
+
+	/* busno is always one cell */
+	*busno = of_read_number(dma_window, 1);
+	dma_window++;
+
+	prop = of_get_property(dn, "ibm,#dma-address-cells", NULL);
+	if (!prop)
+		prop = of_get_property(dn, "#address-cells", NULL);
+
+	cells = prop ? of_read_number(prop, 1) : of_n_addr_cells(dn);
+	*phys = of_read_number(dma_window, cells);
+
+	dma_window += cells;
+
+	prop = of_get_property(dn, "ibm,#dma-size-cells", NULL);
+	cells = prop ? of_read_number(prop, 1) : of_n_size_cells(dn);
+	*size = of_read_number(dma_window, cells);
+}
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
new file mode 100644
index 000000000..073117ba7
--- /dev/null
+++ b/arch/powerpc/kernel/ptrace.c
@@ -0,0 +1,3342 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/m68k/kernel/ptrace.c"
+ *  Copyright (C) 1994 by Hamish Macdonald
+ *  Taken from linux/kernel/ptrace.c and modified for M680x0.
+ *  linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds
+ *
+ * Modified by Cort Dougan (cort@hq.fsmlabs.com)
+ * and Paul Mackerras (paulus@samba.org).
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file README.legal in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/regset.h>
+#include <linux/tracehook.h>
+#include <linux/elf.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/signal.h>
+#include <linux/seccomp.h>
+#include <linux/audit.h>
+#include <trace/syscall.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
+
+#include <linux/uaccess.h>
+#include <linux/pkeys.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/asm-prototypes.h>
+#include <asm/debug.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/*
+ * The parameter save area on the stack is used to store arguments being passed
+ * to callee function and is located at fixed offset from stack pointer.
+ */
+#ifdef CONFIG_PPC32
+#define PARAMETER_SAVE_AREA_OFFSET	24  /* bytes */
+#else /* CONFIG_PPC32 */
+#define PARAMETER_SAVE_AREA_OFFSET	48  /* bytes */
+#endif
+
+struct pt_regs_offset {
+	const char *name;
+	int offset;
+};
+
+#define STR(s)	#s			/* convert to string */
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define GPR_OFFSET_NAME(num)	\
+	{.name = STR(r##num), .offset = offsetof(struct pt_regs, gpr[num])}, \
+	{.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+#define TVSO(f)	(offsetof(struct thread_vr_state, f))
+#define TFSO(f)	(offsetof(struct thread_fp_state, f))
+#define TSO(f)	(offsetof(struct thread_struct, f))
+
+static const struct pt_regs_offset regoffset_table[] = {
+	GPR_OFFSET_NAME(0),
+	GPR_OFFSET_NAME(1),
+	GPR_OFFSET_NAME(2),
+	GPR_OFFSET_NAME(3),
+	GPR_OFFSET_NAME(4),
+	GPR_OFFSET_NAME(5),
+	GPR_OFFSET_NAME(6),
+	GPR_OFFSET_NAME(7),
+	GPR_OFFSET_NAME(8),
+	GPR_OFFSET_NAME(9),
+	GPR_OFFSET_NAME(10),
+	GPR_OFFSET_NAME(11),
+	GPR_OFFSET_NAME(12),
+	GPR_OFFSET_NAME(13),
+	GPR_OFFSET_NAME(14),
+	GPR_OFFSET_NAME(15),
+	GPR_OFFSET_NAME(16),
+	GPR_OFFSET_NAME(17),
+	GPR_OFFSET_NAME(18),
+	GPR_OFFSET_NAME(19),
+	GPR_OFFSET_NAME(20),
+	GPR_OFFSET_NAME(21),
+	GPR_OFFSET_NAME(22),
+	GPR_OFFSET_NAME(23),
+	GPR_OFFSET_NAME(24),
+	GPR_OFFSET_NAME(25),
+	GPR_OFFSET_NAME(26),
+	GPR_OFFSET_NAME(27),
+	GPR_OFFSET_NAME(28),
+	GPR_OFFSET_NAME(29),
+	GPR_OFFSET_NAME(30),
+	GPR_OFFSET_NAME(31),
+	REG_OFFSET_NAME(nip),
+	REG_OFFSET_NAME(msr),
+	REG_OFFSET_NAME(ctr),
+	REG_OFFSET_NAME(link),
+	REG_OFFSET_NAME(xer),
+	REG_OFFSET_NAME(ccr),
+#ifdef CONFIG_PPC64
+	REG_OFFSET_NAME(softe),
+#else
+	REG_OFFSET_NAME(mq),
+#endif
+	REG_OFFSET_NAME(trap),
+	REG_OFFSET_NAME(dar),
+	REG_OFFSET_NAME(dsisr),
+	REG_OFFSET_END,
+};
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static void flush_tmregs_to_thread(struct task_struct *tsk)
+{
+	/*
+	 * If task is not current, it will have been flushed already to
+	 * it's thread_struct during __switch_to().
+	 *
+	 * A reclaim flushes ALL the state or if not in TM save TM SPRs
+	 * in the appropriate thread structures from live.
+	 */
+
+	if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current))
+		return;
+
+	if (MSR_TM_SUSPENDED(mfmsr())) {
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
+	} else {
+		tm_enable();
+		tm_save_sprs(&(tsk->thread));
+	}
+}
+#else
+static inline void flush_tmregs_to_thread(struct task_struct *tsk) { }
+#endif
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name:	the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+	const struct pt_regs_offset *roff;
+	for (roff = regoffset_table; roff->name != NULL; roff++)
+		if (!strcmp(roff->name, name))
+			return roff->offset;
+	return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset:	the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+	const struct pt_regs_offset *roff;
+	for (roff = regoffset_table; roff->name != NULL; roff++)
+		if (roff->offset == offset)
+			return roff->name;
+	return NULL;
+}
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Set of msr bits that gdb can change on behalf of a process.
+ */
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+#define MSR_DEBUGCHANGE	0
+#else
+#define MSR_DEBUGCHANGE	(MSR_SE | MSR_BE)
+#endif
+
+/*
+ * Max register writeable via put_reg
+ */
+#ifdef CONFIG_PPC32
+#define PT_MAX_PUT_REG	PT_MQ
+#else
+#define PT_MAX_PUT_REG	PT_CCR
+#endif
+
+static unsigned long get_user_msr(struct task_struct *task)
+{
+	return task->thread.regs->msr | task->thread.fpexc_mode;
+}
+
+static int set_user_msr(struct task_struct *task, unsigned long msr)
+{
+	task->thread.regs->msr &= ~MSR_DEBUGCHANGE;
+	task->thread.regs->msr |= msr & MSR_DEBUGCHANGE;
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static unsigned long get_user_ckpt_msr(struct task_struct *task)
+{
+	return task->thread.ckpt_regs.msr | task->thread.fpexc_mode;
+}
+
+static int set_user_ckpt_msr(struct task_struct *task, unsigned long msr)
+{
+	task->thread.ckpt_regs.msr &= ~MSR_DEBUGCHANGE;
+	task->thread.ckpt_regs.msr |= msr & MSR_DEBUGCHANGE;
+	return 0;
+}
+
+static int set_user_ckpt_trap(struct task_struct *task, unsigned long trap)
+{
+	task->thread.ckpt_regs.trap = trap & 0xfff0;
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC64
+static int get_user_dscr(struct task_struct *task, unsigned long *data)
+{
+	*data = task->thread.dscr;
+	return 0;
+}
+
+static int set_user_dscr(struct task_struct *task, unsigned long dscr)
+{
+	task->thread.dscr = dscr;
+	task->thread.dscr_inherit = 1;
+	return 0;
+}
+#else
+static int get_user_dscr(struct task_struct *task, unsigned long *data)
+{
+	return -EIO;
+}
+
+static int set_user_dscr(struct task_struct *task, unsigned long dscr)
+{
+	return -EIO;
+}
+#endif
+
+/*
+ * We prevent mucking around with the reserved area of trap
+ * which are used internally by the kernel.
+ */
+static int set_user_trap(struct task_struct *task, unsigned long trap)
+{
+	task->thread.regs->trap = trap & 0xfff0;
+	return 0;
+}
+
+/*
+ * Get contents of register REGNO in task TASK.
+ */
+int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
+{
+	if ((task->thread.regs == NULL) || !data)
+		return -EIO;
+
+	if (regno == PT_MSR) {
+		*data = get_user_msr(task);
+		return 0;
+	}
+
+	if (regno == PT_DSCR)
+		return get_user_dscr(task, data);
+
+#ifdef CONFIG_PPC64
+	/*
+	 * softe copies paca->irq_soft_mask variable state. Since irq_soft_mask is
+	 * no more used as a flag, lets force usr to alway see the softe value as 1
+	 * which means interrupts are not soft disabled.
+	 */
+	if (regno == PT_SOFTE) {
+		*data = 1;
+		return  0;
+	}
+#endif
+
+	if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) {
+		*data = ((unsigned long *)task->thread.regs)[regno];
+		return 0;
+	}
+
+	return -EIO;
+}
+
+/*
+ * Write contents of register REGNO in task TASK.
+ */
+int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data)
+{
+	if (task->thread.regs == NULL)
+		return -EIO;
+
+	if (regno == PT_MSR)
+		return set_user_msr(task, data);
+	if (regno == PT_TRAP)
+		return set_user_trap(task, data);
+	if (regno == PT_DSCR)
+		return set_user_dscr(task, data);
+
+	if (regno <= PT_MAX_PUT_REG) {
+		((unsigned long *)task->thread.regs)[regno] = data;
+		return 0;
+	}
+	return -EIO;
+}
+
+static int gpr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	int i, ret;
+
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	if (!FULL_REGS(target->thread.regs)) {
+		/* We have a partial register set.  Fill 14-31 with bogus values */
+		for (i = 14; i < 32; i++)
+			target->thread.regs->gpr[i] = NV_REG_POISON;
+	}
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  target->thread.regs,
+				  0, offsetof(struct pt_regs, msr));
+	if (!ret) {
+		unsigned long msr = get_user_msr(target);
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr,
+					  offsetof(struct pt_regs, msr),
+					  offsetof(struct pt_regs, msr) +
+					  sizeof(msr));
+	}
+
+	BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+		     offsetof(struct pt_regs, msr) + sizeof(long));
+
+	if (!ret)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					  &target->thread.regs->orig_gpr3,
+					  offsetof(struct pt_regs, orig_gpr3),
+					  sizeof(struct pt_regs));
+	if (!ret)
+		ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					       sizeof(struct pt_regs), -1);
+
+	return ret;
+}
+
+static int gpr_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	unsigned long reg;
+	int ret;
+
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	CHECK_FULL_REGS(target->thread.regs);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 target->thread.regs,
+				 0, PT_MSR * sizeof(reg));
+
+	if (!ret && count > 0) {
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+					 PT_MSR * sizeof(reg),
+					 (PT_MSR + 1) * sizeof(reg));
+		if (!ret)
+			ret = set_user_msr(target, reg);
+	}
+
+	BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+		     offsetof(struct pt_regs, msr) + sizeof(long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					 &target->thread.regs->orig_gpr3,
+					 PT_ORIG_R3 * sizeof(reg),
+					 (PT_MAX_PUT_REG + 1) * sizeof(reg));
+
+	if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret)
+		ret = user_regset_copyin_ignore(
+			&pos, &count, &kbuf, &ubuf,
+			(PT_MAX_PUT_REG + 1) * sizeof(reg),
+			PT_TRAP * sizeof(reg));
+
+	if (!ret && count > 0) {
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+					 PT_TRAP * sizeof(reg),
+					 (PT_TRAP + 1) * sizeof(reg));
+		if (!ret)
+			ret = set_user_trap(target, reg);
+	}
+
+	if (!ret)
+		ret = user_regset_copyin_ignore(
+			&pos, &count, &kbuf, &ubuf,
+			(PT_TRAP + 1) * sizeof(reg), -1);
+
+	return ret;
+}
+
+/*
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	u64	fpr[32];
+ *	u64	fpscr;
+ * };
+ */
+static int fpr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+#ifdef CONFIG_VSX
+	u64 buf[33];
+	int i;
+
+	flush_fp_to_thread(target);
+
+	/* copy to local buffer then write that out */
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.TS_FPR(i);
+	buf[32] = target->thread.fp_state.fpscr;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+#else
+	BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
+		     offsetof(struct thread_fp_state, fpr[32]));
+
+	flush_fp_to_thread(target);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.fp_state, 0, -1);
+#endif
+}
+
+/*
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	u64	fpr[32];
+ *	u64	fpscr;
+ * };
+ *
+ */
+static int fpr_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+#ifdef CONFIG_VSX
+	u64 buf[33];
+	int i;
+
+	flush_fp_to_thread(target);
+
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.TS_FPR(i);
+	buf[32] = target->thread.fp_state.fpscr;
+
+	/* copy to local buffer then write that out */
+	i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+	if (i)
+		return i;
+
+	for (i = 0; i < 32 ; i++)
+		target->thread.TS_FPR(i) = buf[i];
+	target->thread.fp_state.fpscr = buf[32];
+	return 0;
+#else
+	BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
+		     offsetof(struct thread_fp_state, fpr[32]));
+
+	flush_fp_to_thread(target);
+
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fp_state, 0, -1);
+#endif
+}
+
+#ifdef CONFIG_ALTIVEC
+/*
+ * Get/set all the altivec registers vr0..vr31, vscr, vrsave, in one go.
+ * The transfer totals 34 quadword.  Quadwords 0-31 contain the
+ * corresponding vector registers.  Quadword 32 contains the vscr as the
+ * last word (offset 12) within that quadword.  Quadword 33 contains the
+ * vrsave as the first word (offset 0) within the quadword.
+ *
+ * This definition of the VMX state is compatible with the current PPC32
+ * ptrace interface.  This allows signal handling and ptrace to use the
+ * same structures.  This also simplifies the implementation of a bi-arch
+ * (combined (32- and 64-bit) gdb.
+ */
+
+static int vr_active(struct task_struct *target,
+		     const struct user_regset *regset)
+{
+	flush_altivec_to_thread(target);
+	return target->thread.used_vr ? regset->n : 0;
+}
+
+/*
+ * Regardless of transactions, 'vr_state' holds the current running
+ * value of all the VMX registers and 'ckvr_state' holds the last
+ * checkpointed value of all the VMX registers for the current
+ * transaction to fall back on in case it aborts.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	vector128	vr[32];
+ *	vector128	vscr;
+ *	vector128	vrsave;
+ * };
+ */
+static int vr_get(struct task_struct *target, const struct user_regset *regset,
+		  unsigned int pos, unsigned int count,
+		  void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	flush_altivec_to_thread(target);
+
+	BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
+		     offsetof(struct thread_vr_state, vr[32]));
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.vr_state, 0,
+				  33 * sizeof(vector128));
+	if (!ret) {
+		/*
+		 * Copy out only the low-order word of vrsave.
+		 */
+		int start, end;
+		union {
+			elf_vrreg_t reg;
+			u32 word;
+		} vrsave;
+		memset(&vrsave, 0, sizeof(vrsave));
+
+		vrsave.word = target->thread.vrsave;
+
+		start = 33 * sizeof(vector128);
+		end = start + sizeof(vrsave);
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
+					  start, end);
+	}
+
+	return ret;
+}
+
+/*
+ * Regardless of transactions, 'vr_state' holds the current running
+ * value of all the VMX registers and 'ckvr_state' holds the last
+ * checkpointed value of all the VMX registers for the current
+ * transaction to fall back on in case it aborts.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	vector128	vr[32];
+ *	vector128	vscr;
+ *	vector128	vrsave;
+ * };
+ */
+static int vr_set(struct task_struct *target, const struct user_regset *regset,
+		  unsigned int pos, unsigned int count,
+		  const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	flush_altivec_to_thread(target);
+
+	BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
+		     offsetof(struct thread_vr_state, vr[32]));
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &target->thread.vr_state, 0,
+				 33 * sizeof(vector128));
+	if (!ret && count > 0) {
+		/*
+		 * We use only the first word of vrsave.
+		 */
+		int start, end;
+		union {
+			elf_vrreg_t reg;
+			u32 word;
+		} vrsave;
+		memset(&vrsave, 0, sizeof(vrsave));
+
+		vrsave.word = target->thread.vrsave;
+
+		start = 33 * sizeof(vector128);
+		end = start + sizeof(vrsave);
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
+					 start, end);
+		if (!ret)
+			target->thread.vrsave = vrsave.word;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_VSX
+/*
+ * Currently to set and and get all the vsx state, you need to call
+ * the fp and VMX calls as well.  This only get/sets the lower 32
+ * 128bit VSX registers.
+ */
+
+static int vsr_active(struct task_struct *target,
+		      const struct user_regset *regset)
+{
+	flush_vsx_to_thread(target);
+	return target->thread.used_vsr ? regset->n : 0;
+}
+
+/*
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last
+ * checkpointed value of all FPR registers for the current
+ * transaction.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	u64	vsx[32];
+ * };
+ */
+static int vsr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	u64 buf[32];
+	int ret, i;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+	flush_vsx_to_thread(target);
+
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  buf, 0, 32 * sizeof(double));
+
+	return ret;
+}
+
+/*
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last
+ * checkpointed value of all FPR registers for the current
+ * transaction.
+ *
+ * Userspace interface buffer layout:
+ *
+ * struct data {
+ *	u64	vsx[32];
+ * };
+ */
+static int vsr_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	u64 buf[32];
+	int ret,i;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+	flush_vsx_to_thread(target);
+
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 buf, 0, 32 * sizeof(double));
+	if (!ret)
+		for (i = 0; i < 32 ; i++)
+			target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+
+	return ret;
+}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_SPE
+
+/*
+ * For get_evrregs/set_evrregs functions 'data' has the following layout:
+ *
+ * struct {
+ *   u32 evr[32];
+ *   u64 acc;
+ *   u32 spefscr;
+ * }
+ */
+
+static int evr_active(struct task_struct *target,
+		      const struct user_regset *regset)
+{
+	flush_spe_to_thread(target);
+	return target->thread.used_spe ? regset->n : 0;
+}
+
+static int evr_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	flush_spe_to_thread(target);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.evr,
+				  0, sizeof(target->thread.evr));
+
+	BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) !=
+		     offsetof(struct thread_struct, spefscr));
+
+	if (!ret)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					  &target->thread.acc,
+					  sizeof(target->thread.evr), -1);
+
+	return ret;
+}
+
+static int evr_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	flush_spe_to_thread(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &target->thread.evr,
+				 0, sizeof(target->thread.evr));
+
+	BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) !=
+		     offsetof(struct thread_struct, spefscr));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					 &target->thread.acc,
+					 sizeof(target->thread.evr), -1);
+
+	return ret;
+}
+#endif /* CONFIG_SPE */
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/**
+ * tm_cgpr_active - get active number of registers in CGPR
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ *
+ * This function checks for the active number of available
+ * regisers in transaction checkpointed GPR category.
+ */
+static int tm_cgpr_active(struct task_struct *target,
+			  const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return 0;
+
+	return regset->n;
+}
+
+/**
+ * tm_cgpr_get - get CGPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy from.
+ * @ubuf:	User buffer to copy into.
+ *
+ * This function gets transaction checkpointed GPR registers.
+ *
+ * When the transaction is active, 'ckpt_regs' holds all the checkpointed
+ * GPR register values for the current transaction to fall back on if it
+ * aborts in between. This function gets those checkpointed GPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	struct pt_regs ckpt_regs;
+ * };
+ */
+static int tm_cgpr_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.ckpt_regs,
+				  0, offsetof(struct pt_regs, msr));
+	if (!ret) {
+		unsigned long msr = get_user_ckpt_msr(target);
+
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr,
+					  offsetof(struct pt_regs, msr),
+					  offsetof(struct pt_regs, msr) +
+					  sizeof(msr));
+	}
+
+	BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+		     offsetof(struct pt_regs, msr) + sizeof(long));
+
+	if (!ret)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					  &target->thread.ckpt_regs.orig_gpr3,
+					  offsetof(struct pt_regs, orig_gpr3),
+					  sizeof(struct pt_regs));
+	if (!ret)
+		ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					       sizeof(struct pt_regs), -1);
+
+	return ret;
+}
+
+/*
+ * tm_cgpr_set - set the CGPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy into.
+ * @ubuf:	User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed GPR registers.
+ *
+ * When the transaction is active, 'ckpt_regs' holds the checkpointed
+ * GPR register values for the current transaction to fall back on if it
+ * aborts in between. This function sets those checkpointed GPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	struct pt_regs ckpt_regs;
+ * };
+ */
+static int tm_cgpr_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
+{
+	unsigned long reg;
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &target->thread.ckpt_regs,
+				 0, PT_MSR * sizeof(reg));
+
+	if (!ret && count > 0) {
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+					 PT_MSR * sizeof(reg),
+					 (PT_MSR + 1) * sizeof(reg));
+		if (!ret)
+			ret = set_user_ckpt_msr(target, reg);
+	}
+
+	BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) !=
+		     offsetof(struct pt_regs, msr) + sizeof(long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					 &target->thread.ckpt_regs.orig_gpr3,
+					 PT_ORIG_R3 * sizeof(reg),
+					 (PT_MAX_PUT_REG + 1) * sizeof(reg));
+
+	if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret)
+		ret = user_regset_copyin_ignore(
+			&pos, &count, &kbuf, &ubuf,
+			(PT_MAX_PUT_REG + 1) * sizeof(reg),
+			PT_TRAP * sizeof(reg));
+
+	if (!ret && count > 0) {
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
+					 PT_TRAP * sizeof(reg),
+					 (PT_TRAP + 1) * sizeof(reg));
+		if (!ret)
+			ret = set_user_ckpt_trap(target, reg);
+	}
+
+	if (!ret)
+		ret = user_regset_copyin_ignore(
+			&pos, &count, &kbuf, &ubuf,
+			(PT_TRAP + 1) * sizeof(reg), -1);
+
+	return ret;
+}
+
+/**
+ * tm_cfpr_active - get active number of registers in CFPR
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ *
+ * This function checks for the active number of available
+ * regisers in transaction checkpointed FPR category.
+ */
+static int tm_cfpr_active(struct task_struct *target,
+				const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return 0;
+
+	return regset->n;
+}
+
+/**
+ * tm_cfpr_get - get CFPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy from.
+ * @ubuf:	User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed FPR registers.
+ *
+ * When the transaction is active 'ckfp_state' holds the checkpointed
+ * values for the current transaction to fall back on if it aborts
+ * in between. This function gets those checkpointed FPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	u64	fpr[32];
+ *	u64	fpscr;
+ *};
+ */
+static int tm_cfpr_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf)
+{
+	u64 buf[33];
+	int i;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	/* copy to local buffer then write that out */
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.TS_CKFPR(i);
+	buf[32] = target->thread.ckfp_state.fpscr;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+}
+
+/**
+ * tm_cfpr_set - set CFPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy into.
+ * @ubuf:	User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed FPR registers.
+ *
+ * When the transaction is active 'ckfp_state' holds the checkpointed
+ * FPR register values for the current transaction to fall back on
+ * if it aborts in between. This function sets these checkpointed
+ * FPR registers. The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	u64	fpr[32];
+ *	u64	fpscr;
+ *};
+ */
+static int tm_cfpr_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
+{
+	u64 buf[33];
+	int i;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	for (i = 0; i < 32; i++)
+		buf[i] = target->thread.TS_CKFPR(i);
+	buf[32] = target->thread.ckfp_state.fpscr;
+
+	/* copy to local buffer then write that out */
+	i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
+	if (i)
+		return i;
+	for (i = 0; i < 32 ; i++)
+		target->thread.TS_CKFPR(i) = buf[i];
+	target->thread.ckfp_state.fpscr = buf[32];
+	return 0;
+}
+
+/**
+ * tm_cvmx_active - get active number of registers in CVMX
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ *
+ * This function checks for the active number of available
+ * regisers in checkpointed VMX category.
+ */
+static int tm_cvmx_active(struct task_struct *target,
+				const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return 0;
+
+	return regset->n;
+}
+
+/**
+ * tm_cvmx_get - get CMVX registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy from.
+ * @ubuf:	User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed VMX registers.
+ *
+ * When the transaction is active 'ckvr_state' and 'ckvrsave' hold
+ * the checkpointed values for the current transaction to fall
+ * back on if it aborts in between. The userspace interface buffer
+ * layout is as follows.
+ *
+ * struct data {
+ *	vector128	vr[32];
+ *	vector128	vscr;
+ *	vector128	vrsave;
+ *};
+ */
+static int tm_cvmx_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32]));
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	/* Flush the state */
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					&target->thread.ckvr_state, 0,
+					33 * sizeof(vector128));
+	if (!ret) {
+		/*
+		 * Copy out only the low-order word of vrsave.
+		 */
+		union {
+			elf_vrreg_t reg;
+			u32 word;
+		} vrsave;
+		memset(&vrsave, 0, sizeof(vrsave));
+		vrsave.word = target->thread.ckvrsave;
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
+						33 * sizeof(vector128), -1);
+	}
+
+	return ret;
+}
+
+/**
+ * tm_cvmx_set - set CMVX registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy into.
+ * @ubuf:	User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed VMX registers.
+ *
+ * When the transaction is active 'ckvr_state' and 'ckvrsave' hold
+ * the checkpointed values for the current transaction to fall
+ * back on if it aborts in between. The userspace interface buffer
+ * layout is as follows.
+ *
+ * struct data {
+ *	vector128	vr[32];
+ *	vector128	vscr;
+ *	vector128	vrsave;
+ *};
+ */
+static int tm_cvmx_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	BUILD_BUG_ON(TVSO(vscr) != TVSO(vr[32]));
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					&target->thread.ckvr_state, 0,
+					33 * sizeof(vector128));
+	if (!ret && count > 0) {
+		/*
+		 * We use only the low-order word of vrsave.
+		 */
+		union {
+			elf_vrreg_t reg;
+			u32 word;
+		} vrsave;
+		memset(&vrsave, 0, sizeof(vrsave));
+		vrsave.word = target->thread.ckvrsave;
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
+						33 * sizeof(vector128), -1);
+		if (!ret)
+			target->thread.ckvrsave = vrsave.word;
+	}
+
+	return ret;
+}
+
+/**
+ * tm_cvsx_active - get active number of registers in CVSX
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ *
+ * This function checks for the active number of available
+ * regisers in transaction checkpointed VSX category.
+ */
+static int tm_cvsx_active(struct task_struct *target,
+				const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return 0;
+
+	flush_vsx_to_thread(target);
+	return target->thread.used_vsr ? regset->n : 0;
+}
+
+/**
+ * tm_cvsx_get - get CVSX registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy from.
+ * @ubuf:	User buffer to copy into.
+ *
+ * This function gets in transaction checkpointed VSX registers.
+ *
+ * When the transaction is active 'ckfp_state' holds the checkpointed
+ * values for the current transaction to fall back on if it aborts
+ * in between. This function gets those checkpointed VSX registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	u64	vsx[32];
+ *};
+ */
+static int tm_cvsx_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			void *kbuf, void __user *ubuf)
+{
+	u64 buf[32];
+	int ret, i;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	/* Flush the state */
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+	flush_vsx_to_thread(target);
+
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  buf, 0, 32 * sizeof(double));
+
+	return ret;
+}
+
+/**
+ * tm_cvsx_set - set CFPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy into.
+ * @ubuf:	User buffer to copy from.
+ *
+ * This function sets in transaction checkpointed VSX registers.
+ *
+ * When the transaction is active 'ckfp_state' holds the checkpointed
+ * VSX register values for the current transaction to fall back on
+ * if it aborts in between. This function sets these checkpointed
+ * FPR registers. The userspace interface buffer layout is as follows.
+ *
+ * struct data {
+ *	u64	vsx[32];
+ *};
+ */
+static int tm_cvsx_set(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int pos, unsigned int count,
+			const void *kbuf, const void __user *ubuf)
+{
+	u64 buf[32];
+	int ret, i;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	/* Flush the state */
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+	flush_vsx_to_thread(target);
+
+	for (i = 0; i < 32 ; i++)
+		buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 buf, 0, 32 * sizeof(double));
+	if (!ret)
+		for (i = 0; i < 32 ; i++)
+			target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+
+	return ret;
+}
+
+/**
+ * tm_spr_active - get active number of registers in TM SPR
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ *
+ * This function checks the active number of available
+ * regisers in the transactional memory SPR category.
+ */
+static int tm_spr_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	return regset->n;
+}
+
+/**
+ * tm_spr_get - get the TM related SPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy from.
+ * @ubuf:	User buffer to copy into.
+ *
+ * This function gets transactional memory related SPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct {
+ *	u64		tm_tfhar;
+ *	u64		tm_texasr;
+ *	u64		tm_tfiar;
+ * };
+ */
+static int tm_spr_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	/* Build tests */
+	BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr));
+	BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar));
+	BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs));
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	/* Flush the states */
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	/* TFHAR register */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tfhar, 0, sizeof(u64));
+
+	/* TEXASR register */
+	if (!ret)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_texasr, sizeof(u64),
+				2 * sizeof(u64));
+
+	/* TFIAR register */
+	if (!ret)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tfiar,
+				2 * sizeof(u64), 3 * sizeof(u64));
+	return ret;
+}
+
+/**
+ * tm_spr_set - set the TM related SPR registers
+ * @target:	The target task.
+ * @regset:	The user regset structure.
+ * @pos:	The buffer position.
+ * @count:	Number of bytes to copy.
+ * @kbuf:	Kernel buffer to copy into.
+ * @ubuf:	User buffer to copy from.
+ *
+ * This function sets transactional memory related SPR registers.
+ * The userspace interface buffer layout is as follows.
+ *
+ * struct {
+ *	u64		tm_tfhar;
+ *	u64		tm_texasr;
+ *	u64		tm_tfiar;
+ * };
+ */
+static int tm_spr_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	/* Build tests */
+	BUILD_BUG_ON(TSO(tm_tfhar) + sizeof(u64) != TSO(tm_texasr));
+	BUILD_BUG_ON(TSO(tm_texasr) + sizeof(u64) != TSO(tm_tfiar));
+	BUILD_BUG_ON(TSO(tm_tfiar) + sizeof(u64) != TSO(ckpt_regs));
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	/* Flush the states */
+	flush_tmregs_to_thread(target);
+	flush_fp_to_thread(target);
+	flush_altivec_to_thread(target);
+
+	/* TFHAR register */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tfhar, 0, sizeof(u64));
+
+	/* TEXASR register */
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_texasr, sizeof(u64),
+				2 * sizeof(u64));
+
+	/* TFIAR register */
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tfiar,
+				 2 * sizeof(u64), 3 * sizeof(u64));
+	return ret;
+}
+
+static int tm_tar_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (MSR_TM_ACTIVE(target->thread.regs->msr))
+		return regset->n;
+
+	return 0;
+}
+
+static int tm_tar_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tar, 0, sizeof(u64));
+	return ret;
+}
+
+static int tm_tar_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_tar, 0, sizeof(u64));
+	return ret;
+}
+
+static int tm_ppr_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (MSR_TM_ACTIVE(target->thread.regs->msr))
+		return regset->n;
+
+	return 0;
+}
+
+
+static int tm_ppr_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_ppr, 0, sizeof(u64));
+	return ret;
+}
+
+static int tm_ppr_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_ppr, 0, sizeof(u64));
+	return ret;
+}
+
+static int tm_dscr_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (MSR_TM_ACTIVE(target->thread.regs->msr))
+		return regset->n;
+
+	return 0;
+}
+
+static int tm_dscr_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_dscr, 0, sizeof(u64));
+	return ret;
+}
+
+static int tm_dscr_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	if (!cpu_has_feature(CPU_FTR_TM))
+		return -ENODEV;
+
+	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
+		return -ENODATA;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				&target->thread.tm_dscr, 0, sizeof(u64));
+	return ret;
+}
+#endif	/* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+#ifdef CONFIG_PPC64
+static int ppr_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.ppr, 0, sizeof(u64));
+}
+
+static int ppr_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.ppr, 0, sizeof(u64));
+}
+
+static int dscr_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.dscr, 0, sizeof(u64));
+}
+static int dscr_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.dscr, 0, sizeof(u64));
+}
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+static int tar_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.tar, 0, sizeof(u64));
+}
+static int tar_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.tar, 0, sizeof(u64));
+}
+
+static int ebb_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	if (target->thread.used_ebb)
+		return regset->n;
+
+	return 0;
+}
+
+static int ebb_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	/* Build tests */
+	BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr));
+	BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr));
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	if (!target->thread.used_ebb)
+		return -ENODATA;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+			&target->thread.ebbrr, 0, 3 * sizeof(unsigned long));
+}
+
+static int ebb_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret = 0;
+
+	/* Build tests */
+	BUILD_BUG_ON(TSO(ebbrr) + sizeof(unsigned long) != TSO(ebbhr));
+	BUILD_BUG_ON(TSO(ebbhr) + sizeof(unsigned long) != TSO(bescr));
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	if (target->thread.used_ebb)
+		return -ENODATA;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.ebbrr, 0, sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.ebbhr, sizeof(unsigned long),
+			2 * sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.bescr,
+			2 * sizeof(unsigned long), 3 * sizeof(unsigned long));
+
+	return ret;
+}
+static int pmu_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	return regset->n;
+}
+
+static int pmu_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
+{
+	/* Build tests */
+	BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar));
+	BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier));
+	BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2));
+	BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0));
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+			&target->thread.siar, 0,
+			5 * sizeof(unsigned long));
+}
+
+static int pmu_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	int ret = 0;
+
+	/* Build tests */
+	BUILD_BUG_ON(TSO(siar) + sizeof(unsigned long) != TSO(sdar));
+	BUILD_BUG_ON(TSO(sdar) + sizeof(unsigned long) != TSO(sier));
+	BUILD_BUG_ON(TSO(sier) + sizeof(unsigned long) != TSO(mmcr2));
+	BUILD_BUG_ON(TSO(mmcr2) + sizeof(unsigned long) != TSO(mmcr0));
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return -ENODEV;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.siar, 0,
+			sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.sdar, sizeof(unsigned long),
+			2 * sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.sier, 2 * sizeof(unsigned long),
+			3 * sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.mmcr2, 3 * sizeof(unsigned long),
+			4 * sizeof(unsigned long));
+
+	if (!ret)
+		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+			&target->thread.mmcr0, 4 * sizeof(unsigned long),
+			5 * sizeof(unsigned long));
+	return ret;
+}
+#endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+static int pkey_active(struct task_struct *target,
+		       const struct user_regset *regset)
+{
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	return regset->n;
+}
+
+static int pkey_get(struct task_struct *target,
+		    const struct user_regset *regset,
+		    unsigned int pos, unsigned int count,
+		    void *kbuf, void __user *ubuf)
+{
+	BUILD_BUG_ON(TSO(amr) + sizeof(unsigned long) != TSO(iamr));
+	BUILD_BUG_ON(TSO(iamr) + sizeof(unsigned long) != TSO(uamor));
+
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &target->thread.amr, 0,
+				   ELF_NPKEY * sizeof(unsigned long));
+}
+
+static int pkey_set(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      const void *kbuf, const void __user *ubuf)
+{
+	u64 new_amr;
+	int ret;
+
+	if (!arch_pkeys_enabled())
+		return -ENODEV;
+
+	/* Only the AMR can be set from userspace */
+	if (pos != 0 || count != sizeof(new_amr))
+		return -EINVAL;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &new_amr, 0, sizeof(new_amr));
+	if (ret)
+		return ret;
+
+	/* UAMOR determines which bits of the AMR can be set from userspace. */
+	target->thread.amr = (new_amr & target->thread.uamor) |
+		(target->thread.amr & ~target->thread.uamor);
+
+	return 0;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
+/*
+ * These are our native regset flavors.
+ */
+enum powerpc_regset {
+	REGSET_GPR,
+	REGSET_FPR,
+#ifdef CONFIG_ALTIVEC
+	REGSET_VMX,
+#endif
+#ifdef CONFIG_VSX
+	REGSET_VSX,
+#endif
+#ifdef CONFIG_SPE
+	REGSET_SPE,
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	REGSET_TM_CGPR,		/* TM checkpointed GPR registers */
+	REGSET_TM_CFPR,		/* TM checkpointed FPR registers */
+	REGSET_TM_CVMX,		/* TM checkpointed VMX registers */
+	REGSET_TM_CVSX,		/* TM checkpointed VSX registers */
+	REGSET_TM_SPR,		/* TM specific SPR registers */
+	REGSET_TM_CTAR,		/* TM checkpointed TAR register */
+	REGSET_TM_CPPR,		/* TM checkpointed PPR register */
+	REGSET_TM_CDSCR,	/* TM checkpointed DSCR register */
+#endif
+#ifdef CONFIG_PPC64
+	REGSET_PPR,		/* PPR register */
+	REGSET_DSCR,		/* DSCR register */
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	REGSET_TAR,		/* TAR register */
+	REGSET_EBB,		/* EBB registers */
+	REGSET_PMR,		/* Performance Monitor Registers */
+#endif
+#ifdef CONFIG_PPC_MEM_KEYS
+	REGSET_PKEY,		/* AMR register */
+#endif
+};
+
+static const struct user_regset native_regsets[] = {
+	[REGSET_GPR] = {
+		.core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+		.size = sizeof(long), .align = sizeof(long),
+		.get = gpr_get, .set = gpr_set
+	},
+	[REGSET_FPR] = {
+		.core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+		.size = sizeof(double), .align = sizeof(double),
+		.get = fpr_get, .set = fpr_set
+	},
+#ifdef CONFIG_ALTIVEC
+	[REGSET_VMX] = {
+		.core_note_type = NT_PPC_VMX, .n = 34,
+		.size = sizeof(vector128), .align = sizeof(vector128),
+		.active = vr_active, .get = vr_get, .set = vr_set
+	},
+#endif
+#ifdef CONFIG_VSX
+	[REGSET_VSX] = {
+		.core_note_type = NT_PPC_VSX, .n = 32,
+		.size = sizeof(double), .align = sizeof(double),
+		.active = vsr_active, .get = vsr_get, .set = vsr_set
+	},
+#endif
+#ifdef CONFIG_SPE
+	[REGSET_SPE] = {
+		.core_note_type = NT_PPC_SPE, .n = 35,
+		.size = sizeof(u32), .align = sizeof(u32),
+		.active = evr_active, .get = evr_get, .set = evr_set
+	},
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	[REGSET_TM_CGPR] = {
+		.core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+		.size = sizeof(long), .align = sizeof(long),
+		.active = tm_cgpr_active, .get = tm_cgpr_get, .set = tm_cgpr_set
+	},
+	[REGSET_TM_CFPR] = {
+		.core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+		.size = sizeof(double), .align = sizeof(double),
+		.active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set
+	},
+	[REGSET_TM_CVMX] = {
+		.core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+		.size = sizeof(vector128), .align = sizeof(vector128),
+		.active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set
+	},
+	[REGSET_TM_CVSX] = {
+		.core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+		.size = sizeof(double), .align = sizeof(double),
+		.active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set
+	},
+	[REGSET_TM_SPR] = {
+		.core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set
+	},
+	[REGSET_TM_CTAR] = {
+		.core_note_type = NT_PPC_TM_CTAR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set
+	},
+	[REGSET_TM_CPPR] = {
+		.core_note_type = NT_PPC_TM_CPPR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set
+	},
+	[REGSET_TM_CDSCR] = {
+		.core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set
+	},
+#endif
+#ifdef CONFIG_PPC64
+	[REGSET_PPR] = {
+		.core_note_type = NT_PPC_PPR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = ppr_get, .set = ppr_set
+	},
+	[REGSET_DSCR] = {
+		.core_note_type = NT_PPC_DSCR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = dscr_get, .set = dscr_set
+	},
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	[REGSET_TAR] = {
+		.core_note_type = NT_PPC_TAR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = tar_get, .set = tar_set
+	},
+	[REGSET_EBB] = {
+		.core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = ebb_active, .get = ebb_get, .set = ebb_set
+	},
+	[REGSET_PMR] = {
+		.core_note_type = NT_PPC_PMU, .n = ELF_NPMU,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = pmu_active, .get = pmu_get, .set = pmu_set
+	},
+#endif
+#ifdef CONFIG_PPC_MEM_KEYS
+	[REGSET_PKEY] = {
+		.core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = pkey_active, .get = pkey_get, .set = pkey_set
+	},
+#endif
+};
+
+static const struct user_regset_view user_ppc_native_view = {
+	.name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI,
+	.regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
+};
+
+#ifdef CONFIG_PPC64
+#include <linux/compat.h>
+
+static int gpr32_get_common(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+			    void *kbuf, void __user *ubuf,
+			    unsigned long *regs)
+{
+	compat_ulong_t *k = kbuf;
+	compat_ulong_t __user *u = ubuf;
+	compat_ulong_t reg;
+
+	pos /= sizeof(reg);
+	count /= sizeof(reg);
+
+	if (kbuf)
+		for (; count > 0 && pos < PT_MSR; --count)
+			*k++ = regs[pos++];
+	else
+		for (; count > 0 && pos < PT_MSR; --count)
+			if (__put_user((compat_ulong_t) regs[pos++], u++))
+				return -EFAULT;
+
+	if (count > 0 && pos == PT_MSR) {
+		reg = get_user_msr(target);
+		if (kbuf)
+			*k++ = reg;
+		else if (__put_user(reg, u++))
+			return -EFAULT;
+		++pos;
+		--count;
+	}
+
+	if (kbuf)
+		for (; count > 0 && pos < PT_REGS_COUNT; --count)
+			*k++ = regs[pos++];
+	else
+		for (; count > 0 && pos < PT_REGS_COUNT; --count)
+			if (__put_user((compat_ulong_t) regs[pos++], u++))
+				return -EFAULT;
+
+	kbuf = k;
+	ubuf = u;
+	pos *= sizeof(reg);
+	count *= sizeof(reg);
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					PT_REGS_COUNT * sizeof(reg), -1);
+}
+
+static int gpr32_set_common(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf,
+		     unsigned long *regs)
+{
+	const compat_ulong_t *k = kbuf;
+	const compat_ulong_t __user *u = ubuf;
+	compat_ulong_t reg;
+
+	pos /= sizeof(reg);
+	count /= sizeof(reg);
+
+	if (kbuf)
+		for (; count > 0 && pos < PT_MSR; --count)
+			regs[pos++] = *k++;
+	else
+		for (; count > 0 && pos < PT_MSR; --count) {
+			if (__get_user(reg, u++))
+				return -EFAULT;
+			regs[pos++] = reg;
+		}
+
+
+	if (count > 0 && pos == PT_MSR) {
+		if (kbuf)
+			reg = *k++;
+		else if (__get_user(reg, u++))
+			return -EFAULT;
+		set_user_msr(target, reg);
+		++pos;
+		--count;
+	}
+
+	if (kbuf) {
+		for (; count > 0 && pos <= PT_MAX_PUT_REG; --count)
+			regs[pos++] = *k++;
+		for (; count > 0 && pos < PT_TRAP; --count, ++pos)
+			++k;
+	} else {
+		for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) {
+			if (__get_user(reg, u++))
+				return -EFAULT;
+			regs[pos++] = reg;
+		}
+		for (; count > 0 && pos < PT_TRAP; --count, ++pos)
+			if (__get_user(reg, u++))
+				return -EFAULT;
+	}
+
+	if (count > 0 && pos == PT_TRAP) {
+		if (kbuf)
+			reg = *k++;
+		else if (__get_user(reg, u++))
+			return -EFAULT;
+		set_user_trap(target, reg);
+		++pos;
+		--count;
+	}
+
+	kbuf = k;
+	ubuf = u;
+	pos *= sizeof(reg);
+	count *= sizeof(reg);
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					 (PT_TRAP + 1) * sizeof(reg), -1);
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static int tm_cgpr32_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf)
+{
+	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.ckpt_regs.gpr[0]);
+}
+
+static int tm_cgpr32_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf)
+{
+	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.ckpt_regs.gpr[0]);
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+static int gpr32_get(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     void *kbuf, void __user *ubuf)
+{
+	int i;
+
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	if (!FULL_REGS(target->thread.regs)) {
+		/*
+		 * We have a partial register set.
+		 * Fill 14-31 with bogus values.
+		 */
+		for (i = 14; i < 32; i++)
+			target->thread.regs->gpr[i] = NV_REG_POISON;
+	}
+	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.regs->gpr[0]);
+}
+
+static int gpr32_set(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int pos, unsigned int count,
+		     const void *kbuf, const void __user *ubuf)
+{
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	CHECK_FULL_REGS(target->thread.regs);
+	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.regs->gpr[0]);
+}
+
+/*
+ * These are the regset flavors matching the CONFIG_PPC32 native set.
+ */
+static const struct user_regset compat_regsets[] = {
+	[REGSET_GPR] = {
+		.core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+		.size = sizeof(compat_long_t), .align = sizeof(compat_long_t),
+		.get = gpr32_get, .set = gpr32_set
+	},
+	[REGSET_FPR] = {
+		.core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+		.size = sizeof(double), .align = sizeof(double),
+		.get = fpr_get, .set = fpr_set
+	},
+#ifdef CONFIG_ALTIVEC
+	[REGSET_VMX] = {
+		.core_note_type = NT_PPC_VMX, .n = 34,
+		.size = sizeof(vector128), .align = sizeof(vector128),
+		.active = vr_active, .get = vr_get, .set = vr_set
+	},
+#endif
+#ifdef CONFIG_SPE
+	[REGSET_SPE] = {
+		.core_note_type = NT_PPC_SPE, .n = 35,
+		.size = sizeof(u32), .align = sizeof(u32),
+		.active = evr_active, .get = evr_get, .set = evr_set
+	},
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	[REGSET_TM_CGPR] = {
+		.core_note_type = NT_PPC_TM_CGPR, .n = ELF_NGREG,
+		.size = sizeof(long), .align = sizeof(long),
+		.active = tm_cgpr_active,
+		.get = tm_cgpr32_get, .set = tm_cgpr32_set
+	},
+	[REGSET_TM_CFPR] = {
+		.core_note_type = NT_PPC_TM_CFPR, .n = ELF_NFPREG,
+		.size = sizeof(double), .align = sizeof(double),
+		.active = tm_cfpr_active, .get = tm_cfpr_get, .set = tm_cfpr_set
+	},
+	[REGSET_TM_CVMX] = {
+		.core_note_type = NT_PPC_TM_CVMX, .n = ELF_NVMX,
+		.size = sizeof(vector128), .align = sizeof(vector128),
+		.active = tm_cvmx_active, .get = tm_cvmx_get, .set = tm_cvmx_set
+	},
+	[REGSET_TM_CVSX] = {
+		.core_note_type = NT_PPC_TM_CVSX, .n = ELF_NVSX,
+		.size = sizeof(double), .align = sizeof(double),
+		.active = tm_cvsx_active, .get = tm_cvsx_get, .set = tm_cvsx_set
+	},
+	[REGSET_TM_SPR] = {
+		.core_note_type = NT_PPC_TM_SPR, .n = ELF_NTMSPRREG,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_spr_active, .get = tm_spr_get, .set = tm_spr_set
+	},
+	[REGSET_TM_CTAR] = {
+		.core_note_type = NT_PPC_TM_CTAR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_tar_active, .get = tm_tar_get, .set = tm_tar_set
+	},
+	[REGSET_TM_CPPR] = {
+		.core_note_type = NT_PPC_TM_CPPR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_ppr_active, .get = tm_ppr_get, .set = tm_ppr_set
+	},
+	[REGSET_TM_CDSCR] = {
+		.core_note_type = NT_PPC_TM_CDSCR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = tm_dscr_active, .get = tm_dscr_get, .set = tm_dscr_set
+	},
+#endif
+#ifdef CONFIG_PPC64
+	[REGSET_PPR] = {
+		.core_note_type = NT_PPC_PPR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = ppr_get, .set = ppr_set
+	},
+	[REGSET_DSCR] = {
+		.core_note_type = NT_PPC_DSCR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = dscr_get, .set = dscr_set
+	},
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	[REGSET_TAR] = {
+		.core_note_type = NT_PPC_TAR, .n = 1,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.get = tar_get, .set = tar_set
+	},
+	[REGSET_EBB] = {
+		.core_note_type = NT_PPC_EBB, .n = ELF_NEBB,
+		.size = sizeof(u64), .align = sizeof(u64),
+		.active = ebb_active, .get = ebb_get, .set = ebb_set
+	},
+#endif
+};
+
+static const struct user_regset_view user_ppc_compat_view = {
+	.name = "ppc", .e_machine = EM_PPC, .ei_osabi = ELF_OSABI,
+	.regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets)
+};
+#endif	/* CONFIG_PPC64 */
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+#ifdef CONFIG_PPC64
+	if (test_tsk_thread_flag(task, TIF_32BIT))
+		return &user_ppc_compat_view;
+#endif
+	return &user_ppc_native_view;
+}
+
+
+void user_enable_single_step(struct task_struct *task)
+{
+	struct pt_regs *regs = task->thread.regs;
+
+	if (regs != NULL) {
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		task->thread.debug.dbcr0 &= ~DBCR0_BT;
+		task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+		regs->msr |= MSR_DE;
+#else
+		regs->msr &= ~MSR_BE;
+		regs->msr |= MSR_SE;
+#endif
+	}
+	set_tsk_thread_flag(task, TIF_SINGLESTEP);
+}
+
+void user_enable_block_step(struct task_struct *task)
+{
+	struct pt_regs *regs = task->thread.regs;
+
+	if (regs != NULL) {
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		task->thread.debug.dbcr0 &= ~DBCR0_IC;
+		task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT;
+		regs->msr |= MSR_DE;
+#else
+		regs->msr &= ~MSR_SE;
+		regs->msr |= MSR_BE;
+#endif
+	}
+	set_tsk_thread_flag(task, TIF_SINGLESTEP);
+}
+
+void user_disable_single_step(struct task_struct *task)
+{
+	struct pt_regs *regs = task->thread.regs;
+
+	if (regs != NULL) {
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		/*
+		 * The logic to disable single stepping should be as
+		 * simple as turning off the Instruction Complete flag.
+		 * And, after doing so, if all debug flags are off, turn
+		 * off DBCR0(IDM) and MSR(DE) .... Torez
+		 */
+		task->thread.debug.dbcr0 &= ~(DBCR0_IC|DBCR0_BT);
+		/*
+		 * Test to see if any of the DBCR_ACTIVE_EVENTS bits are set.
+		 */
+		if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0,
+					task->thread.debug.dbcr1)) {
+			/*
+			 * All debug events were off.....
+			 */
+			task->thread.debug.dbcr0 &= ~DBCR0_IDM;
+			regs->msr &= ~MSR_DE;
+		}
+#else
+		regs->msr &= ~(MSR_SE | MSR_BE);
+#endif
+	}
+	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
+}
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void ptrace_triggered(struct perf_event *bp,
+		      struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct perf_event_attr attr;
+
+	/*
+	 * Disable the breakpoint request here since ptrace has defined a
+	 * one-shot behaviour for breakpoint exceptions in PPC64.
+	 * The SIGTRAP signal is generated automatically for us in do_dabr().
+	 * We don't have to do anything about that here
+	 */
+	attr = bp->attr;
+	attr.disabled = true;
+	modify_user_hw_breakpoint(bp, &attr);
+}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
+static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
+			       unsigned long data)
+{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	int ret;
+	struct thread_struct *thread = &(task->thread);
+	struct perf_event *bp;
+	struct perf_event_attr attr;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+	bool set_bp = true;
+	struct arch_hw_breakpoint hw_brk;
+#endif
+
+	/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
+	 *  For embedded processors we support one DAC and no IAC's at the
+	 *  moment.
+	 */
+	if (addr > 0)
+		return -EINVAL;
+
+	/* The bottom 3 bits in dabr are flags */
+	if ((data & ~0x7UL) >= TASK_SIZE)
+		return -EIO;
+
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+	/* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
+	 *  It was assumed, on previous implementations, that 3 bits were
+	 *  passed together with the data address, fitting the design of the
+	 *  DABR register, as follows:
+	 *
+	 *  bit 0: Read flag
+	 *  bit 1: Write flag
+	 *  bit 2: Breakpoint translation
+	 *
+	 *  Thus, we use them here as so.
+	 */
+
+	/* Ensure breakpoint translation bit is set */
+	if (data && !(data & HW_BRK_TYPE_TRANSLATE))
+		return -EIO;
+	hw_brk.address = data & (~HW_BRK_TYPE_DABR);
+	hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL;
+	hw_brk.len = 8;
+	set_bp = (data) && (hw_brk.type & HW_BRK_TYPE_RDWR);
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	bp = thread->ptrace_bps[0];
+	if (!set_bp) {
+		if (bp) {
+			unregister_hw_breakpoint(bp);
+			thread->ptrace_bps[0] = NULL;
+		}
+		return 0;
+	}
+	if (bp) {
+		attr = bp->attr;
+		attr.bp_addr = hw_brk.address;
+		arch_bp_generic_fields(hw_brk.type, &attr.bp_type);
+
+		/* Enable breakpoint */
+		attr.disabled = false;
+
+		ret =  modify_user_hw_breakpoint(bp, &attr);
+		if (ret) {
+			return ret;
+		}
+		thread->ptrace_bps[0] = bp;
+		thread->hw_brk = hw_brk;
+		return 0;
+	}
+
+	/* Create a new breakpoint request if one doesn't exist already */
+	hw_breakpoint_init(&attr);
+	attr.bp_addr = hw_brk.address;
+	attr.bp_len = 8;
+	arch_bp_generic_fields(hw_brk.type,
+			       &attr.bp_type);
+
+	thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
+					       ptrace_triggered, NULL, task);
+	if (IS_ERR(bp)) {
+		thread->ptrace_bps[0] = NULL;
+		return PTR_ERR(bp);
+	}
+
+#else /* !CONFIG_HAVE_HW_BREAKPOINT */
+	if (set_bp && (!ppc_breakpoint_available()))
+		return -ENODEV;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+	task->thread.hw_brk = hw_brk;
+#else /* CONFIG_PPC_ADV_DEBUG_REGS */
+	/* As described above, it was assumed 3 bits were passed with the data
+	 *  address, but we will assume only the mode bits will be passed
+	 *  as to not cause alignment restrictions for DAC-based processors.
+	 */
+
+	/* DAC's hold the whole address without any mode flags */
+	task->thread.debug.dac1 = data & ~0x3UL;
+
+	if (task->thread.debug.dac1 == 0) {
+		dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W);
+		if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0,
+					task->thread.debug.dbcr1)) {
+			task->thread.regs->msr &= ~MSR_DE;
+			task->thread.debug.dbcr0 &= ~DBCR0_IDM;
+		}
+		return 0;
+	}
+
+	/* Read or Write bits must be set */
+
+	if (!(data & 0x3UL))
+		return -EINVAL;
+
+	/* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
+	   register */
+	task->thread.debug.dbcr0 |= DBCR0_IDM;
+
+	/* Check for write and read flags and set DBCR0
+	   accordingly */
+	dbcr_dac(task) &= ~(DBCR_DAC1R|DBCR_DAC1W);
+	if (data & 0x1UL)
+		dbcr_dac(task) |= DBCR_DAC1R;
+	if (data & 0x2UL)
+		dbcr_dac(task) |= DBCR_DAC1W;
+	task->thread.regs->msr |= MSR_DE;
+#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
+	return 0;
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure single step bits etc are not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+	/* make sure the single step bit is not set. */
+	user_disable_single_step(child);
+}
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+static long set_instruction_bp(struct task_struct *child,
+			      struct ppc_hw_breakpoint *bp_info)
+{
+	int slot;
+	int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0);
+	int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0);
+	int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0);
+	int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0);
+
+	if (dbcr_iac_range(child) & DBCR_IAC12MODE)
+		slot2_in_use = 1;
+	if (dbcr_iac_range(child) & DBCR_IAC34MODE)
+		slot4_in_use = 1;
+
+	if (bp_info->addr >= TASK_SIZE)
+		return -EIO;
+
+	if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) {
+
+		/* Make sure range is valid. */
+		if (bp_info->addr2 >= TASK_SIZE)
+			return -EIO;
+
+		/* We need a pair of IAC regsisters */
+		if ((!slot1_in_use) && (!slot2_in_use)) {
+			slot = 1;
+			child->thread.debug.iac1 = bp_info->addr;
+			child->thread.debug.iac2 = bp_info->addr2;
+			child->thread.debug.dbcr0 |= DBCR0_IAC1;
+			if (bp_info->addr_mode ==
+					PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
+				dbcr_iac_range(child) |= DBCR_IAC12X;
+			else
+				dbcr_iac_range(child) |= DBCR_IAC12I;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+		} else if ((!slot3_in_use) && (!slot4_in_use)) {
+			slot = 3;
+			child->thread.debug.iac3 = bp_info->addr;
+			child->thread.debug.iac4 = bp_info->addr2;
+			child->thread.debug.dbcr0 |= DBCR0_IAC3;
+			if (bp_info->addr_mode ==
+					PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
+				dbcr_iac_range(child) |= DBCR_IAC34X;
+			else
+				dbcr_iac_range(child) |= DBCR_IAC34I;
+#endif
+		} else
+			return -ENOSPC;
+	} else {
+		/* We only need one.  If possible leave a pair free in
+		 * case a range is needed later
+		 */
+		if (!slot1_in_use) {
+			/*
+			 * Don't use iac1 if iac1-iac2 are free and either
+			 * iac3 or iac4 (but not both) are free
+			 */
+			if (slot2_in_use || (slot3_in_use == slot4_in_use)) {
+				slot = 1;
+				child->thread.debug.iac1 = bp_info->addr;
+				child->thread.debug.dbcr0 |= DBCR0_IAC1;
+				goto out;
+			}
+		}
+		if (!slot2_in_use) {
+			slot = 2;
+			child->thread.debug.iac2 = bp_info->addr;
+			child->thread.debug.dbcr0 |= DBCR0_IAC2;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+		} else if (!slot3_in_use) {
+			slot = 3;
+			child->thread.debug.iac3 = bp_info->addr;
+			child->thread.debug.dbcr0 |= DBCR0_IAC3;
+		} else if (!slot4_in_use) {
+			slot = 4;
+			child->thread.debug.iac4 = bp_info->addr;
+			child->thread.debug.dbcr0 |= DBCR0_IAC4;
+#endif
+		} else
+			return -ENOSPC;
+	}
+out:
+	child->thread.debug.dbcr0 |= DBCR0_IDM;
+	child->thread.regs->msr |= MSR_DE;
+
+	return slot;
+}
+
+static int del_instruction_bp(struct task_struct *child, int slot)
+{
+	switch (slot) {
+	case 1:
+		if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0)
+			return -ENOENT;
+
+		if (dbcr_iac_range(child) & DBCR_IAC12MODE) {
+			/* address range - clear slots 1 & 2 */
+			child->thread.debug.iac2 = 0;
+			dbcr_iac_range(child) &= ~DBCR_IAC12MODE;
+		}
+		child->thread.debug.iac1 = 0;
+		child->thread.debug.dbcr0 &= ~DBCR0_IAC1;
+		break;
+	case 2:
+		if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0)
+			return -ENOENT;
+
+		if (dbcr_iac_range(child) & DBCR_IAC12MODE)
+			/* used in a range */
+			return -EINVAL;
+		child->thread.debug.iac2 = 0;
+		child->thread.debug.dbcr0 &= ~DBCR0_IAC2;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case 3:
+		if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0)
+			return -ENOENT;
+
+		if (dbcr_iac_range(child) & DBCR_IAC34MODE) {
+			/* address range - clear slots 3 & 4 */
+			child->thread.debug.iac4 = 0;
+			dbcr_iac_range(child) &= ~DBCR_IAC34MODE;
+		}
+		child->thread.debug.iac3 = 0;
+		child->thread.debug.dbcr0 &= ~DBCR0_IAC3;
+		break;
+	case 4:
+		if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0)
+			return -ENOENT;
+
+		if (dbcr_iac_range(child) & DBCR_IAC34MODE)
+			/* Used in a range */
+			return -EINVAL;
+		child->thread.debug.iac4 = 0;
+		child->thread.debug.dbcr0 &= ~DBCR0_IAC4;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info)
+{
+	int byte_enable =
+		(bp_info->condition_mode >> PPC_BREAKPOINT_CONDITION_BE_SHIFT)
+		& 0xf;
+	int condition_mode =
+		bp_info->condition_mode & PPC_BREAKPOINT_CONDITION_MODE;
+	int slot;
+
+	if (byte_enable && (condition_mode == 0))
+		return -EINVAL;
+
+	if (bp_info->addr >= TASK_SIZE)
+		return -EIO;
+
+	if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) {
+		slot = 1;
+		if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
+			dbcr_dac(child) |= DBCR_DAC1R;
+		if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
+			dbcr_dac(child) |= DBCR_DAC1W;
+		child->thread.debug.dac1 = (unsigned long)bp_info->addr;
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+		if (byte_enable) {
+			child->thread.debug.dvc1 =
+				(unsigned long)bp_info->condition_value;
+			child->thread.debug.dbcr2 |=
+				((byte_enable << DBCR2_DVC1BE_SHIFT) |
+				 (condition_mode << DBCR2_DVC1M_SHIFT));
+		}
+#endif
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+	} else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) {
+		/* Both dac1 and dac2 are part of a range */
+		return -ENOSPC;
+#endif
+	} else if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) {
+		slot = 2;
+		if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
+			dbcr_dac(child) |= DBCR_DAC2R;
+		if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
+			dbcr_dac(child) |= DBCR_DAC2W;
+		child->thread.debug.dac2 = (unsigned long)bp_info->addr;
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+		if (byte_enable) {
+			child->thread.debug.dvc2 =
+				(unsigned long)bp_info->condition_value;
+			child->thread.debug.dbcr2 |=
+				((byte_enable << DBCR2_DVC2BE_SHIFT) |
+				 (condition_mode << DBCR2_DVC2M_SHIFT));
+		}
+#endif
+	} else
+		return -ENOSPC;
+	child->thread.debug.dbcr0 |= DBCR0_IDM;
+	child->thread.regs->msr |= MSR_DE;
+
+	return slot + 4;
+}
+
+static int del_dac(struct task_struct *child, int slot)
+{
+	if (slot == 1) {
+		if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0)
+			return -ENOENT;
+
+		child->thread.debug.dac1 = 0;
+		dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W);
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+		if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) {
+			child->thread.debug.dac2 = 0;
+			child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE;
+		}
+		child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE);
+#endif
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+		child->thread.debug.dvc1 = 0;
+#endif
+	} else if (slot == 2) {
+		if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0)
+			return -ENOENT;
+
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+		if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE)
+			/* Part of a range */
+			return -EINVAL;
+		child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE);
+#endif
+#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
+		child->thread.debug.dvc2 = 0;
+#endif
+		child->thread.debug.dac2 = 0;
+		dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W);
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
+
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+static int set_dac_range(struct task_struct *child,
+			 struct ppc_hw_breakpoint *bp_info)
+{
+	int mode = bp_info->addr_mode & PPC_BREAKPOINT_MODE_MASK;
+
+	/* We don't allow range watchpoints to be used with DVC */
+	if (bp_info->condition_mode)
+		return -EINVAL;
+
+	/*
+	 * Best effort to verify the address range.  The user/supervisor bits
+	 * prevent trapping in kernel space, but let's fail on an obvious bad
+	 * range.  The simple test on the mask is not fool-proof, and any
+	 * exclusive range will spill over into kernel space.
+	 */
+	if (bp_info->addr >= TASK_SIZE)
+		return -EIO;
+	if (mode == PPC_BREAKPOINT_MODE_MASK) {
+		/*
+		 * dac2 is a bitmask.  Don't allow a mask that makes a
+		 * kernel space address from a valid dac1 value
+		 */
+		if (~((unsigned long)bp_info->addr2) >= TASK_SIZE)
+			return -EIO;
+	} else {
+		/*
+		 * For range breakpoints, addr2 must also be a valid address
+		 */
+		if (bp_info->addr2 >= TASK_SIZE)
+			return -EIO;
+	}
+
+	if (child->thread.debug.dbcr0 &
+	    (DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W))
+		return -ENOSPC;
+
+	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
+		child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM);
+	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
+		child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM);
+	child->thread.debug.dac1 = bp_info->addr;
+	child->thread.debug.dac2 = bp_info->addr2;
+	if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE)
+		child->thread.debug.dbcr2  |= DBCR2_DAC12M;
+	else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
+		child->thread.debug.dbcr2  |= DBCR2_DAC12MX;
+	else	/* PPC_BREAKPOINT_MODE_MASK */
+		child->thread.debug.dbcr2  |= DBCR2_DAC12MM;
+	child->thread.regs->msr |= MSR_DE;
+
+	return 5;
+}
+#endif /* CONFIG_PPC_ADV_DEBUG_DAC_RANGE */
+
+static long ppc_set_hwdebug(struct task_struct *child,
+		     struct ppc_hw_breakpoint *bp_info)
+{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	int len = 0;
+	struct thread_struct *thread = &(child->thread);
+	struct perf_event *bp;
+	struct perf_event_attr attr;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+	struct arch_hw_breakpoint brk;
+#endif
+
+	if (bp_info->version != 1)
+		return -ENOTSUPP;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	/*
+	 * Check for invalid flags and combinations
+	 */
+	if ((bp_info->trigger_type == 0) ||
+	    (bp_info->trigger_type & ~(PPC_BREAKPOINT_TRIGGER_EXECUTE |
+				       PPC_BREAKPOINT_TRIGGER_RW)) ||
+	    (bp_info->addr_mode & ~PPC_BREAKPOINT_MODE_MASK) ||
+	    (bp_info->condition_mode &
+	     ~(PPC_BREAKPOINT_CONDITION_MODE |
+	       PPC_BREAKPOINT_CONDITION_BE_ALL)))
+		return -EINVAL;
+#if CONFIG_PPC_ADV_DEBUG_DVCS == 0
+	if (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE)
+		return -EINVAL;
+#endif
+
+	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_EXECUTE) {
+		if ((bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_EXECUTE) ||
+		    (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE))
+			return -EINVAL;
+		return set_instruction_bp(child, bp_info);
+	}
+	if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+		return set_dac(child, bp_info);
+
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+	return set_dac_range(child, bp_info);
+#else
+	return -EINVAL;
+#endif
+#else /* !CONFIG_PPC_ADV_DEBUG_DVCS */
+	/*
+	 * We only support one data breakpoint
+	 */
+	if ((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0 ||
+	    (bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0 ||
+	    bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE)
+		return -EINVAL;
+
+	if ((unsigned long)bp_info->addr >= TASK_SIZE)
+		return -EIO;
+
+	brk.address = bp_info->addr & ~7UL;
+	brk.type = HW_BRK_TYPE_TRANSLATE;
+	brk.len = 8;
+	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
+		brk.type |= HW_BRK_TYPE_READ;
+	if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
+		brk.type |= HW_BRK_TYPE_WRITE;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	/*
+	 * Check if the request is for 'range' breakpoints. We can
+	 * support it if range < 8 bytes.
+	 */
+	if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE)
+		len = bp_info->addr2 - bp_info->addr;
+	else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+		len = 1;
+	else
+		return -EINVAL;
+	bp = thread->ptrace_bps[0];
+	if (bp)
+		return -ENOSPC;
+
+	/* Create a new breakpoint request if one doesn't exist already */
+	hw_breakpoint_init(&attr);
+	attr.bp_addr = (unsigned long)bp_info->addr & ~HW_BREAKPOINT_ALIGN;
+	attr.bp_len = len;
+	arch_bp_generic_fields(brk.type, &attr.bp_type);
+
+	thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
+					       ptrace_triggered, NULL, child);
+	if (IS_ERR(bp)) {
+		thread->ptrace_bps[0] = NULL;
+		return PTR_ERR(bp);
+	}
+
+	return 1;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
+	if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT)
+		return -EINVAL;
+
+	if (child->thread.hw_brk.address)
+		return -ENOSPC;
+
+	if (!ppc_breakpoint_available())
+		return -ENODEV;
+
+	child->thread.hw_brk = brk;
+
+	return 1;
+#endif /* !CONFIG_PPC_ADV_DEBUG_DVCS */
+}
+
+static long ppc_del_hwdebug(struct task_struct *child, long data)
+{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	int ret = 0;
+	struct thread_struct *thread = &(child->thread);
+	struct perf_event *bp;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	int rc;
+
+	if (data <= 4)
+		rc = del_instruction_bp(child, (int)data);
+	else
+		rc = del_dac(child, (int)data - 4);
+
+	if (!rc) {
+		if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0,
+					child->thread.debug.dbcr1)) {
+			child->thread.debug.dbcr0 &= ~DBCR0_IDM;
+			child->thread.regs->msr &= ~MSR_DE;
+		}
+	}
+	return rc;
+#else
+	if (data != 1)
+		return -EINVAL;
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	bp = thread->ptrace_bps[0];
+	if (bp) {
+		unregister_hw_breakpoint(bp);
+		thread->ptrace_bps[0] = NULL;
+	} else
+		ret = -ENOENT;
+	return ret;
+#else /* CONFIG_HAVE_HW_BREAKPOINT */
+	if (child->thread.hw_brk.address == 0)
+		return -ENOENT;
+
+	child->thread.hw_brk.address = 0;
+	child->thread.hw_brk.type = 0;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
+	return 0;
+#endif
+}
+
+long arch_ptrace(struct task_struct *child, long request,
+		 unsigned long addr, unsigned long data)
+{
+	int ret = -EPERM;
+	void __user *datavp = (void __user *) data;
+	unsigned long __user *datalp = datavp;
+
+	switch (request) {
+	/* read the word at location addr in the USER area. */
+	case PTRACE_PEEKUSR: {
+		unsigned long index, tmp;
+
+		ret = -EIO;
+		/* convert to index and check */
+#ifdef CONFIG_PPC32
+		index = addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR)
+		    || (child->thread.regs == NULL))
+#else
+		index = addr >> 3;
+		if ((addr & 7) || (index > PT_FPSCR))
+#endif
+			break;
+
+		CHECK_FULL_REGS(child->thread.regs);
+		if (index < PT_FPR0) {
+			ret = ptrace_get_reg(child, (int) index, &tmp);
+			if (ret)
+				break;
+		} else {
+			unsigned int fpidx = index - PT_FPR0;
+
+			flush_fp_to_thread(child);
+			if (fpidx < (PT_FPSCR - PT_FPR0))
+				if (IS_ENABLED(CONFIG_PPC32)) {
+					// On 32-bit the index we are passed refers to 32-bit words
+					tmp = ((u32 *)child->thread.fp_state.fpr)[fpidx];
+				} else {
+					memcpy(&tmp, &child->thread.TS_FPR(fpidx),
+					       sizeof(long));
+				}
+			else
+				tmp = child->thread.fp_state.fpscr;
+		}
+		ret = put_user(tmp, datalp);
+		break;
+	}
+
+	/* write the word at location addr in the USER area */
+	case PTRACE_POKEUSR: {
+		unsigned long index;
+
+		ret = -EIO;
+		/* convert to index and check */
+#ifdef CONFIG_PPC32
+		index = addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR)
+		    || (child->thread.regs == NULL))
+#else
+		index = addr >> 3;
+		if ((addr & 7) || (index > PT_FPSCR))
+#endif
+			break;
+
+		CHECK_FULL_REGS(child->thread.regs);
+		if (index < PT_FPR0) {
+			ret = ptrace_put_reg(child, index, data);
+		} else {
+			unsigned int fpidx = index - PT_FPR0;
+
+			flush_fp_to_thread(child);
+			if (fpidx < (PT_FPSCR - PT_FPR0))
+				if (IS_ENABLED(CONFIG_PPC32)) {
+					// On 32-bit the index we are passed refers to 32-bit words
+					((u32 *)child->thread.fp_state.fpr)[fpidx] = data;
+				} else {
+					memcpy(&child->thread.TS_FPR(fpidx), &data,
+					       sizeof(long));
+				}
+			else
+				child->thread.fp_state.fpscr = data;
+			ret = 0;
+		}
+		break;
+	}
+
+	case PPC_PTRACE_GETHWDBGINFO: {
+		struct ppc_debug_info dbginfo;
+
+		dbginfo.version = 1;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		dbginfo.num_instruction_bps = CONFIG_PPC_ADV_DEBUG_IACS;
+		dbginfo.num_data_bps = CONFIG_PPC_ADV_DEBUG_DACS;
+		dbginfo.num_condition_regs = CONFIG_PPC_ADV_DEBUG_DVCS;
+		dbginfo.data_bp_alignment = 4;
+		dbginfo.sizeof_condition = 4;
+		dbginfo.features = PPC_DEBUG_FEATURE_INSN_BP_RANGE |
+				   PPC_DEBUG_FEATURE_INSN_BP_MASK;
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+		dbginfo.features |=
+				   PPC_DEBUG_FEATURE_DATA_BP_RANGE |
+				   PPC_DEBUG_FEATURE_DATA_BP_MASK;
+#endif
+#else /* !CONFIG_PPC_ADV_DEBUG_REGS */
+		dbginfo.num_instruction_bps = 0;
+		if (ppc_breakpoint_available())
+			dbginfo.num_data_bps = 1;
+		else
+			dbginfo.num_data_bps = 0;
+		dbginfo.num_condition_regs = 0;
+#ifdef CONFIG_PPC64
+		dbginfo.data_bp_alignment = 8;
+#else
+		dbginfo.data_bp_alignment = 4;
+#endif
+		dbginfo.sizeof_condition = 0;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE;
+		if (cpu_has_feature(CPU_FTR_DAWR))
+			dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR;
+#else
+		dbginfo.features = 0;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
+
+		if (copy_to_user(datavp, &dbginfo,
+				 sizeof(struct ppc_debug_info)))
+			return -EFAULT;
+		return 0;
+	}
+
+	case PPC_PTRACE_SETHWDEBUG: {
+		struct ppc_hw_breakpoint bp_info;
+
+		if (copy_from_user(&bp_info, datavp,
+				   sizeof(struct ppc_hw_breakpoint)))
+			return -EFAULT;
+		return ppc_set_hwdebug(child, &bp_info);
+	}
+
+	case PPC_PTRACE_DELHWDEBUG: {
+		ret = ppc_del_hwdebug(child, data);
+		break;
+	}
+
+	case PTRACE_GET_DEBUGREG: {
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+		unsigned long dabr_fake;
+#endif
+		ret = -EINVAL;
+		/* We only support one DABR and no IABRS at the moment */
+		if (addr > 0)
+			break;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		ret = put_user(child->thread.debug.dac1, datalp);
+#else
+		dabr_fake = ((child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) |
+			     (child->thread.hw_brk.type & HW_BRK_TYPE_DABR));
+		ret = put_user(dabr_fake, datalp);
+#endif
+		break;
+	}
+
+	case PTRACE_SET_DEBUGREG:
+		ret = ptrace_set_debugreg(child, addr, data);
+		break;
+
+#ifdef CONFIG_PPC64
+	case PTRACE_GETREGS64:
+#endif
+	case PTRACE_GETREGS:	/* Get all pt_regs from the child. */
+		return copy_regset_to_user(child, &user_ppc_native_view,
+					   REGSET_GPR,
+					   0, sizeof(struct pt_regs),
+					   datavp);
+
+#ifdef CONFIG_PPC64
+	case PTRACE_SETREGS64:
+#endif
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_ppc_native_view,
+					     REGSET_GPR,
+					     0, sizeof(struct pt_regs),
+					     datavp);
+
+	case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */
+		return copy_regset_to_user(child, &user_ppc_native_view,
+					   REGSET_FPR,
+					   0, sizeof(elf_fpregset_t),
+					   datavp);
+
+	case PTRACE_SETFPREGS: /* Set the child FPU state (FPR0...31 + FPSCR) */
+		return copy_regset_from_user(child, &user_ppc_native_view,
+					     REGSET_FPR,
+					     0, sizeof(elf_fpregset_t),
+					     datavp);
+
+#ifdef CONFIG_ALTIVEC
+	case PTRACE_GETVRREGS:
+		return copy_regset_to_user(child, &user_ppc_native_view,
+					   REGSET_VMX,
+					   0, (33 * sizeof(vector128) +
+					       sizeof(u32)),
+					   datavp);
+
+	case PTRACE_SETVRREGS:
+		return copy_regset_from_user(child, &user_ppc_native_view,
+					     REGSET_VMX,
+					     0, (33 * sizeof(vector128) +
+						 sizeof(u32)),
+					     datavp);
+#endif
+#ifdef CONFIG_VSX
+	case PTRACE_GETVSRREGS:
+		return copy_regset_to_user(child, &user_ppc_native_view,
+					   REGSET_VSX,
+					   0, 32 * sizeof(double),
+					   datavp);
+
+	case PTRACE_SETVSRREGS:
+		return copy_regset_from_user(child, &user_ppc_native_view,
+					     REGSET_VSX,
+					     0, 32 * sizeof(double),
+					     datavp);
+#endif
+#ifdef CONFIG_SPE
+	case PTRACE_GETEVRREGS:
+		/* Get the child spe register state. */
+		return copy_regset_to_user(child, &user_ppc_native_view,
+					   REGSET_SPE, 0, 35 * sizeof(u32),
+					   datavp);
+
+	case PTRACE_SETEVRREGS:
+		/* Set the child spe register state. */
+		return copy_regset_from_user(child, &user_ppc_native_view,
+					     REGSET_SPE, 0, 35 * sizeof(u32),
+					     datavp);
+#endif
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_SECCOMP
+static int do_seccomp(struct pt_regs *regs)
+{
+	if (!test_thread_flag(TIF_SECCOMP))
+		return 0;
+
+	/*
+	 * The ABI we present to seccomp tracers is that r3 contains
+	 * the syscall return value and orig_gpr3 contains the first
+	 * syscall parameter. This is different to the ptrace ABI where
+	 * both r3 and orig_gpr3 contain the first syscall parameter.
+	 */
+	regs->gpr[3] = -ENOSYS;
+
+	/*
+	 * We use the __ version here because we have already checked
+	 * TIF_SECCOMP. If this fails, there is nothing left to do, we
+	 * have already loaded -ENOSYS into r3, or seccomp has put
+	 * something else in r3 (via SECCOMP_RET_ERRNO/TRACE).
+	 */
+	if (__secure_computing(NULL))
+		return -1;
+
+	/*
+	 * The syscall was allowed by seccomp, restore the register
+	 * state to what audit expects.
+	 * Note that we use orig_gpr3, which means a seccomp tracer can
+	 * modify the first syscall parameter (in orig_gpr3) and also
+	 * allow the syscall to proceed.
+	 */
+	regs->gpr[3] = regs->orig_gpr3;
+
+	return 0;
+}
+#else
+static inline int do_seccomp(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_SECCOMP */
+
+/**
+ * do_syscall_trace_enter() - Do syscall tracing on kernel entry.
+ * @regs: the pt_regs of the task to trace (current)
+ *
+ * Performs various types of tracing on syscall entry. This includes seccomp,
+ * ptrace, syscall tracepoints and audit.
+ *
+ * The pt_regs are potentially visible to userspace via ptrace, so their
+ * contents is ABI.
+ *
+ * One or more of the tracers may modify the contents of pt_regs, in particular
+ * to modify arguments or even the syscall number itself.
+ *
+ * It's also possible that a tracer can choose to reject the system call. In
+ * that case this function will return an illegal syscall number, and will put
+ * an appropriate return value in regs->r3.
+ *
+ * Return: the (possibly changed) syscall number.
+ */
+long do_syscall_trace_enter(struct pt_regs *regs)
+{
+	user_exit();
+
+	/*
+	 * The tracer may decide to abort the syscall, if so tracehook
+	 * will return !0. Note that the tracer may also just change
+	 * regs->gpr[0] to an invalid syscall number, that is handled
+	 * below on the exit path.
+	 */
+	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
+	    tracehook_report_syscall_entry(regs))
+		goto skip;
+
+	/* Run seccomp after ptrace; allow it to set gpr[3]. */
+	if (do_seccomp(regs))
+		return -1;
+
+	/* Avoid trace and audit when syscall is invalid. */
+	if (regs->gpr[0] >= NR_syscalls)
+		goto skip;
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->gpr[0]);
+
+#ifdef CONFIG_PPC64
+	if (!is_32bit_task())
+		audit_syscall_entry(regs->gpr[0], regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+	else
+#endif
+		audit_syscall_entry(regs->gpr[0],
+				    regs->gpr[3] & 0xffffffff,
+				    regs->gpr[4] & 0xffffffff,
+				    regs->gpr[5] & 0xffffffff,
+				    regs->gpr[6] & 0xffffffff);
+
+	/* Return the possibly modified but valid syscall number */
+	return regs->gpr[0];
+
+skip:
+	/*
+	 * If we are aborting explicitly, or if the syscall number is
+	 * now invalid, set the return value to -ENOSYS.
+	 */
+	regs->gpr[3] = -ENOSYS;
+	return -1;
+}
+
+void do_syscall_trace_leave(struct pt_regs *regs)
+{
+	int step;
+
+	audit_syscall_exit(regs);
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->result);
+
+	step = test_thread_flag(TIF_SINGLESTEP);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
+}
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
new file mode 100644
index 000000000..f37eb53de
--- /dev/null
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -0,0 +1,318 @@
+/*
+ * ptrace for 32-bit processes running on a 64-bit kernel.
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/m68k/kernel/ptrace.c"
+ *  Copyright (C) 1994 by Hamish Macdonald
+ *  Taken from linux/kernel/ptrace.c and modified for M680x0.
+ *  linux/kernel/ptrace.c is by Ross Biro 1/23/92, edited by Linus Torvalds
+ *
+ * Modified by Cort Dougan (cort@hq.fsmlabs.com)
+ * and Paul Mackerras (paulus@samba.org).
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/regset.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/signal.h>
+#include <linux/compat.h>
+
+#include <linux/uaccess.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/switch_to.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/* Macros to workout the correct index for the FPR in the thread struct */
+#define FPRNUMBER(i) (((i) - PT_FPR0) >> 1)
+#define FPRHALF(i) (((i) - PT_FPR0) & 1)
+#define FPRINDEX(i) TS_FPRWIDTH * FPRNUMBER(i) * 2 + FPRHALF(i)
+
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+			compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	int ret;
+
+	switch (request) {
+	/*
+	 * Read 4 bytes of the other process' storage
+	 *  data is a pointer specifying where the user wants the
+	 *	4 bytes copied into
+	 *  addr is a pointer in the user's storage that contains an 8 byte
+	 *	address in the other process of the 4 bytes that is to be read
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 * when I and D space are separate, these will need to be fixed.
+	 */
+	case PPC_PTRACE_PEEKTEXT_3264:
+	case PPC_PTRACE_PEEKDATA_3264: {
+		u32 tmp;
+		int copied;
+		u32 __user * addrOthers;
+
+		ret = -EIO;
+
+		/* Get the addr in the other process that we want to read */
+		if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+			break;
+
+		copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
+				sizeof(tmp), FOLL_FORCE);
+		if (copied != sizeof(tmp))
+			break;
+		ret = put_user(tmp, (u32 __user *)data);
+		break;
+	}
+
+	/* Read a register (specified by ADDR) out of the "user area" */
+	case PTRACE_PEEKUSR: {
+		int index;
+		unsigned long tmp;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR32))
+			break;
+
+		CHECK_FULL_REGS(child->thread.regs);
+		if (index < PT_FPR0) {
+			ret = ptrace_get_reg(child, index, &tmp);
+			if (ret)
+				break;
+		} else {
+			flush_fp_to_thread(child);
+			/*
+			 * the user space code considers the floating point
+			 * to be an array of unsigned int (32 bits) - the
+			 * index passed in is based on this assumption.
+			 */
+			tmp = ((unsigned int *)child->thread.fp_state.fpr)
+				[FPRINDEX(index)];
+		}
+		ret = put_user((unsigned int)tmp, (u32 __user *)data);
+		break;
+	}
+  
+	/*
+	 * Read 4 bytes out of the other process' pt_regs area
+	 *  data is a pointer specifying where the user wants the
+	 *	4 bytes copied into
+	 *  addr is the offset into the other process' pt_regs structure
+	 *	that is to be read
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 */
+	case PPC_PTRACE_PEEKUSR_3264: {
+		u32 index;
+		u32 reg32bits;
+		u64 tmp;
+		u32 numReg;
+		u32 part;
+
+		ret = -EIO;
+		/* Determine which register the user wants */
+		index = (u64)addr >> 2;
+		numReg = index / 2;
+		/* Determine which part of the register the user wants */
+		if (index % 2)
+			part = 1;  /* want the 2nd half of the register (right-most). */
+		else
+			part = 0;  /* want the 1st half of the register (left-most). */
+
+		/* Validate the input - check to see if address is on the wrong boundary
+		 * or beyond the end of the user area
+		 */
+		if ((addr & 3) || numReg > PT_FPSCR)
+			break;
+
+		CHECK_FULL_REGS(child->thread.regs);
+		if (numReg >= PT_FPR0) {
+			flush_fp_to_thread(child);
+			/* get 64 bit FPR */
+			tmp = child->thread.fp_state.fpr[numReg - PT_FPR0][0];
+		} else { /* register within PT_REGS struct */
+			unsigned long tmp2;
+			ret = ptrace_get_reg(child, numReg, &tmp2);
+			if (ret)
+				break;
+			tmp = tmp2;
+		} 
+		reg32bits = ((u32*)&tmp)[part];
+		ret = put_user(reg32bits, (u32 __user *)data);
+		break;
+	}
+
+	/*
+	 * Write 4 bytes into the other process' storage
+	 *  data is the 4 bytes that the user wants written
+	 *  addr is a pointer in the user's storage that contains an
+	 *	8 byte address in the other process where the 4 bytes
+	 *	that is to be written
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 * when I and D space are separate, these will need to be fixed.
+	 */
+	case PPC_PTRACE_POKETEXT_3264:
+	case PPC_PTRACE_POKEDATA_3264: {
+		u32 tmp = data;
+		u32 __user * addrOthers;
+
+		/* Get the addr in the other process that we want to write into */
+		ret = -EIO;
+		if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+			break;
+		ret = 0;
+		if (ptrace_access_vm(child, (u64)addrOthers, &tmp,
+					sizeof(tmp),
+					FOLL_FORCE | FOLL_WRITE) == sizeof(tmp))
+			break;
+		ret = -EIO;
+		break;
+	}
+
+	/* write the word at location addr in the USER area */
+	case PTRACE_POKEUSR: {
+		unsigned long index;
+
+		ret = -EIO;
+		/* convert to index and check */
+		index = (unsigned long) addr >> 2;
+		if ((addr & 3) || (index > PT_FPSCR32))
+			break;
+
+		CHECK_FULL_REGS(child->thread.regs);
+		if (index < PT_FPR0) {
+			ret = ptrace_put_reg(child, index, data);
+		} else {
+			flush_fp_to_thread(child);
+			/*
+			 * the user space code considers the floating point
+			 * to be an array of unsigned int (32 bits) - the
+			 * index passed in is based on this assumption.
+			 */
+			((unsigned int *)child->thread.fp_state.fpr)
+				[FPRINDEX(index)] = data;
+			ret = 0;
+		}
+		break;
+	}
+
+	/*
+	 * Write 4 bytes into the other process' pt_regs area
+	 *  data is the 4 bytes that the user wants written
+	 *  addr is the offset into the other process' pt_regs structure
+	 *	that is to be written into
+	 * (this is run in a 32-bit process looking at a 64-bit process)
+	 */
+	case PPC_PTRACE_POKEUSR_3264: {
+		u32 index;
+		u32 numReg;
+
+		ret = -EIO;
+		/* Determine which register the user wants */
+		index = (u64)addr >> 2;
+		numReg = index / 2;
+
+		/*
+		 * Validate the input - check to see if address is on the
+		 * wrong boundary or beyond the end of the user area
+		 */
+		if ((addr & 3) || (numReg > PT_FPSCR))
+			break;
+		CHECK_FULL_REGS(child->thread.regs);
+		if (numReg < PT_FPR0) {
+			unsigned long freg;
+			ret = ptrace_get_reg(child, numReg, &freg);
+			if (ret)
+				break;
+			if (index % 2)
+				freg = (freg & ~0xfffffffful) | (data & 0xfffffffful);
+			else
+				freg = (freg & 0xfffffffful) | (data << 32);
+			ret = ptrace_put_reg(child, numReg, freg);
+		} else {
+			u64 *tmp;
+			flush_fp_to_thread(child);
+			/* get 64 bit FPR ... */
+			tmp = &child->thread.fp_state.fpr[numReg - PT_FPR0][0];
+			/* ... write the 32 bit part we want */
+			((u32 *)tmp)[index % 2] = data;
+			ret = 0;
+		}
+		break;
+	}
+
+	case PTRACE_GET_DEBUGREG: {
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+		unsigned long dabr_fake;
+#endif
+		ret = -EINVAL;
+		/* We only support one DABR and no IABRS at the moment */
+		if (addr > 0)
+			break;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+		ret = put_user(child->thread.debug.dac1, (u32 __user *)data);
+#else
+		dabr_fake = (
+			(child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) |
+			(child->thread.hw_brk.type & HW_BRK_TYPE_DABR));
+		ret = put_user(dabr_fake, (u32 __user *)data);
+#endif
+		break;
+	}
+
+	case PTRACE_GETREGS:	/* Get all pt_regs from the child. */
+		return copy_regset_to_user(
+			child, task_user_regset_view(current), 0,
+			0, PT_REGS_COUNT * sizeof(compat_long_t),
+			compat_ptr(data));
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(
+			child, task_user_regset_view(current), 0,
+			0, PT_REGS_COUNT * sizeof(compat_long_t),
+			compat_ptr(data));
+
+	case PTRACE_GETFPREGS:
+	case PTRACE_SETFPREGS:
+	case PTRACE_GETVRREGS:
+	case PTRACE_SETVRREGS:
+	case PTRACE_GETVSRREGS:
+	case PTRACE_SETVSRREGS:
+	case PTRACE_GETREGS64:
+	case PTRACE_SETREGS64:
+	case PTRACE_KILL:
+	case PTRACE_SINGLESTEP:
+	case PTRACE_DETACH:
+	case PTRACE_SET_DEBUGREG:
+	case PTRACE_SYSCALL:
+	case PTRACE_CONT:
+	case PPC_PTRACE_GETHWDBGINFO:
+	case PPC_PTRACE_SETHWDEBUG:
+	case PPC_PTRACE_DELHWDEBUG:
+		ret = arch_ptrace(child, request, addr, data);
+		break;
+
+	default:
+		ret = compat_ptrace_request(child, request, addr, data);
+		break;
+	}
+
+	return ret;
+}
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
new file mode 100644
index 000000000..f366fedb0
--- /dev/null
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -0,0 +1,209 @@
+/*
+ * Code to process dynamic relocations for PPC32.
+ *
+ * Copyrights (C) IBM Corporation, 2011.
+ *	Author: Suzuki Poulose <suzuki@in.ibm.com>
+ *
+ *  - Based on ppc64 code - reloc_64.S
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+
+/* Dynamic section table entry tags */
+DT_RELA = 7			/* Tag for Elf32_Rela section */
+DT_RELASZ = 8			/* Size of the Rela relocs */
+DT_RELAENT = 9			/* Size of one Rela reloc entry */
+
+STN_UNDEF = 0			/* Undefined symbol index */
+STB_LOCAL = 0			/* Local binding for the symbol */
+
+R_PPC_ADDR16_LO = 4		/* Lower half of (S+A) */
+R_PPC_ADDR16_HI = 5		/* Upper half of (S+A) */
+R_PPC_ADDR16_HA = 6		/* High Adjusted (S+A) */
+R_PPC_RELATIVE = 22
+
+/*
+ * r3 = desired final address
+ */
+
+_GLOBAL(relocate)
+
+	mflr	r0		/* Save our LR */
+	bl	0f		/* Find our current runtime address */
+0:	mflr	r12		/* Make it accessible */
+	mtlr	r0
+
+	lwz	r11, (p_dyn - 0b)(r12)
+	add	r11, r11, r12	/* runtime address of .dynamic section */
+	lwz	r9, (p_rela - 0b)(r12)
+	add	r9, r9, r12	/* runtime address of .rela.dyn section */
+	lwz	r10, (p_st - 0b)(r12)
+	add	r10, r10, r12	/* runtime address of _stext section */
+	lwz	r13, (p_sym - 0b)(r12)
+	add	r13, r13, r12	/* runtime address of .dynsym section */
+
+	/*
+	 * Scan the dynamic section for RELA, RELASZ entries
+	 */
+	li	r6, 0
+	li	r7, 0
+	li	r8, 0
+1:	lwz	r5, 0(r11)	/* ELF_Dyn.d_tag */
+	cmpwi	r5, 0		/* End of ELF_Dyn[] */
+	beq	eodyn
+	cmpwi	r5, DT_RELA
+	bne	relasz
+	lwz	r7, 4(r11)	/* r7 = rela.link */
+	b	skip
+relasz:
+	cmpwi	r5, DT_RELASZ
+	bne	relaent
+	lwz	r8, 4(r11)	/* r8 = Total Rela relocs size */
+	b	skip
+relaent:
+	cmpwi	r5, DT_RELAENT
+	bne	skip
+	lwz	r6, 4(r11)	/* r6 = Size of one Rela reloc */
+skip:
+	addi	r11, r11, 8
+	b	1b
+eodyn:				/* End of Dyn Table scan */
+
+	/* Check if we have found all the entries */
+	cmpwi	r7, 0
+	beq	done
+	cmpwi	r8, 0
+	beq	done
+	cmpwi	r6, 0
+	beq	done
+
+
+	/*
+	 * Work out the current offset from the link time address of .rela
+	 * section.
+	 *  cur_offset[r7] = rela.run[r9] - rela.link [r7]
+	 *  _stext.link[r12] = _stext.run[r10] - cur_offset[r7]
+	 *  final_offset[r3] = _stext.final[r3] - _stext.link[r12]
+	 */
+	subf	r7, r7, r9	/* cur_offset */
+	subf	r12, r7, r10
+	subf	r3, r12, r3	/* final_offset */
+
+	subf	r8, r6, r8	/* relaz -= relaent */
+	/*
+	 * Scan through the .rela table and process each entry
+	 * r9	- points to the current .rela table entry
+	 * r13	- points to the symbol table
+	 */
+
+	/*
+	 * Check if we have a relocation based on symbol
+	 * r5 will hold the value of the symbol.
+	 */
+applyrela:
+	lwz	r4, 4(r9)		/* r4 = rela.r_info */
+	srwi	r5, r4, 8		/* ELF32_R_SYM(r_info) */
+	cmpwi	r5, STN_UNDEF	/* sym == STN_UNDEF ? */
+	beq	get_type	/* value = 0 */
+	/* Find the value of the symbol at index(r5) */
+	slwi	r5, r5, 4		/* r5 = r5 * sizeof(Elf32_Sym) */
+	add	r12, r13, r5	/* r12 = &__dyn_sym[Index] */
+
+	/*
+	 * GNU ld has a bug, where dynamic relocs based on
+	 * STB_LOCAL symbols, the value should be assumed
+	 * to be zero. - Alan Modra
+	 */
+	/* XXX: Do we need to check if we are using GNU ld ? */
+	lbz	r5, 12(r12)	/* r5 = dyn_sym[Index].st_info */
+	extrwi	r5, r5, 4, 24	/* r5 = ELF32_ST_BIND(r5) */
+	cmpwi	r5, STB_LOCAL	/* st_value = 0, ld bug */
+	beq	get_type	/* We have r5 = 0 */
+	lwz	r5, 4(r12)	/* r5 = __dyn_sym[Index].st_value */
+
+get_type:
+	/* Load the relocation type to r4 */
+	extrwi	r4, r4, 8, 24	/* r4 = ELF32_R_TYPE(r_info) = ((char*)r4)[3] */
+
+	/* R_PPC_RELATIVE */
+	cmpwi	r4, R_PPC_RELATIVE
+	bne	hi16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3	/* final addend */
+	stwx	r0, r4, r7	/* memory[r4+r7]) = (u32)r0 */
+	b	nxtrela		/* continue */
+
+	/* R_PPC_ADDR16_HI */
+hi16:
+	cmpwi	r4, R_PPC_ADDR16_HI
+	bne	ha16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r0, r0, 16, 0	/* r0 = (r0 >> 16) */
+	b	store_half
+
+	/* R_PPC_ADDR16_HA */
+ha16:
+	cmpwi	r4, R_PPC_ADDR16_HA
+	bne	lo16
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r5, r0, 1, 16	/* Extract bit 16 */
+	extrwi	r0, r0, 16, 0	/* r0 = (r0 >> 16) */
+	add	r0, r0, r5	/* Add it to r0 */
+	b	store_half
+
+	/* R_PPC_ADDR16_LO */
+lo16:
+	cmpwi	r4, R_PPC_ADDR16_LO
+	bne	unknown_type
+	lwz	r4, 0(r9)	/* r_offset */
+	lwz	r0, 8(r9)	/* r_addend */
+	add	r0, r0, r3
+	add	r0, r0, r5	/* r0 = (S+A+Offset) */
+	extrwi	r0, r0, 16, 16	/* r0 &= 0xffff */
+	/* Fall through to */
+
+	/* Store half word */
+store_half:
+	sthx	r0, r4, r7	/* memory[r4+r7] = (u16)r0 */
+
+nxtrela:
+	/*
+	 * We have to flush the modified instructions to the
+	 * main storage from the d-cache. And also, invalidate the
+	 * cached instructions in i-cache which has been modified.
+	 *
+	 * We delay the sync / isync operation till the end, since
+	 * we won't be executing the modified instructions until
+	 * we return from here.
+	 */
+	dcbst	r4,r7
+	sync			/* Ensure the data is flushed before icbi */
+	icbi	r4,r7
+unknown_type:
+	cmpwi	r8, 0		/* relasz = 0 ? */
+	ble	done
+	add	r9, r9, r6	/* move to next entry in the .rela table */
+	subf	r8, r6, r8	/* relasz -= relaent */
+	b	applyrela
+
+done:
+	sync			/* Wait for the flush to finish */
+	isync			/* Discard prefetched instructions */
+	blr
+
+p_dyn:		.long	__dynamic_start - 0b
+p_rela:		.long	__rela_dyn_start - 0b
+p_sym:		.long	__dynamic_symtab - 0b
+p_st:		.long	_stext - 0b
diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S
new file mode 100644
index 000000000..e8cfc69f5
--- /dev/null
+++ b/arch/powerpc/kernel/reloc_64.S
@@ -0,0 +1,88 @@
+/*
+ * Code to process dynamic relocations in the kernel.
+ *
+ * Copyright 2008 Paul Mackerras, IBM Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+
+RELA = 7
+RELACOUNT = 0x6ffffff9
+R_PPC64_RELATIVE = 22
+
+/*
+ * r3 = desired final address of kernel
+ */
+_GLOBAL(relocate)
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r12		/* r12 has runtime addr of label 0 */
+	mtlr	r0
+	ld	r11,(p_dyn - 0b)(r12)
+	add	r11,r11,r12	/* r11 has runtime addr of .dynamic section */
+	ld	r9,(p_rela - 0b)(r12)
+	add	r9,r9,r12	/* r9 has runtime addr of .rela.dyn section */
+	ld	r10,(p_st - 0b)(r12)
+	add	r10,r10,r12	/* r10 has runtime addr of _stext */
+
+	/*
+	 * Scan the dynamic section for the RELA and RELACOUNT entries.
+	 */
+	li	r7,0
+	li	r8,0
+1:	ld	r6,0(r11)	/* get tag */
+	cmpdi	r6,0
+	beq	4f		/* end of list */
+	cmpdi	r6,RELA
+	bne	2f
+	ld	r7,8(r11)	/* get RELA pointer in r7 */
+	b	3f
+2:	addis	r6,r6,(-RELACOUNT)@ha
+	cmpdi	r6,RELACOUNT@l
+	bne	3f
+	ld	r8,8(r11)	/* get RELACOUNT value in r8 */
+3:	addi	r11,r11,16
+	b	1b
+4:	cmpdi	r7,0		/* check we have both RELA and RELACOUNT */
+	cmpdi	cr1,r8,0
+	beq	6f
+	beq	cr1,6f
+
+	/*
+	 * Work out linktime address of _stext and hence the
+	 * relocation offset to be applied.
+	 * cur_offset [r7] = rela.run [r9] - rela.link [r7]
+	 * _stext.link [r10] = _stext.run [r10] - cur_offset [r7]
+	 * final_offset [r3] = _stext.final [r3] - _stext.link [r10]
+	 */
+	subf	r7,r7,r9	/* cur_offset */
+	subf	r10,r7,r10
+	subf	r3,r10,r3	/* final_offset */
+
+	/*
+	 * Run through the list of relocations and process the
+	 * R_PPC64_RELATIVE ones.
+	 */
+	mtctr	r8
+5:	ld	r0,8(9)		/* ELF64_R_TYPE(reloc->r_info) */
+	cmpdi	r0,R_PPC64_RELATIVE
+	bne	6f
+	ld	r6,0(r9)	/* reloc->r_offset */
+	ld	r0,16(r9)	/* reloc->r_addend */
+	add	r0,r0,r3
+	stdx	r0,r7,r6
+	addi	r9,r9,24
+	bdnz	5b
+
+6:	blr
+
+.balign 8
+p_dyn:	.8byte	__dynamic_start - 0b
+p_rela:	.8byte	__rela_dyn_start - 0b
+p_st:	.8byte	_stext - 0b
+
diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c
new file mode 100644
index 000000000..487dcd8da
--- /dev/null
+++ b/arch/powerpc/kernel/rtas-proc.c
@@ -0,0 +1,765 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *   Copyright (C) 2000 Tilmann Bitterberg
+ *   (tilmann@bitterberg.de)
+ *
+ *   RTAS (Runtime Abstraction Services) stuff
+ *   Intention is to provide a clean user interface
+ *   to use the RTAS.
+ *
+ *   TODO:
+ *   Split off a header file and maybe move it to a different
+ *   location. Write Documentation on what the /proc/rtas/ entries
+ *   actually do.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/bitops.h>
+#include <linux/rtc.h>
+
+#include <linux/uaccess.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/machdep.h> /* for ppc_md */
+#include <asm/time.h>
+
+/* Token for Sensors */
+#define KEY_SWITCH		0x0001
+#define ENCLOSURE_SWITCH	0x0002
+#define THERMAL_SENSOR		0x0003
+#define LID_STATUS		0x0004
+#define POWER_SOURCE		0x0005
+#define BATTERY_VOLTAGE		0x0006
+#define BATTERY_REMAINING	0x0007
+#define BATTERY_PERCENTAGE	0x0008
+#define EPOW_SENSOR		0x0009
+#define BATTERY_CYCLESTATE	0x000a
+#define BATTERY_CHARGING	0x000b
+
+/* IBM specific sensors */
+#define IBM_SURVEILLANCE	0x2328 /* 9000 */
+#define IBM_FANRPM		0x2329 /* 9001 */
+#define IBM_VOLTAGE		0x232a /* 9002 */
+#define IBM_DRCONNECTOR		0x232b /* 9003 */
+#define IBM_POWERSUPPLY		0x232c /* 9004 */
+
+/* Status return values */
+#define SENSOR_CRITICAL_HIGH	13
+#define SENSOR_WARNING_HIGH	12
+#define SENSOR_NORMAL		11
+#define SENSOR_WARNING_LOW	10
+#define SENSOR_CRITICAL_LOW	 9
+#define SENSOR_SUCCESS		 0
+#define SENSOR_HW_ERROR		-1
+#define SENSOR_BUSY		-2
+#define SENSOR_NOT_EXIST	-3
+#define SENSOR_DR_ENTITY	-9000
+
+/* Location Codes */
+#define LOC_SCSI_DEV_ADDR	'A'
+#define LOC_SCSI_DEV_LOC	'B'
+#define LOC_CPU			'C'
+#define LOC_DISKETTE		'D'
+#define LOC_ETHERNET		'E'
+#define LOC_FAN			'F'
+#define LOC_GRAPHICS		'G'
+/* reserved / not used		'H' */
+#define LOC_IO_ADAPTER		'I'
+/* reserved / not used		'J' */
+#define LOC_KEYBOARD		'K'
+#define LOC_LCD			'L'
+#define LOC_MEMORY		'M'
+#define LOC_NV_MEMORY		'N'
+#define LOC_MOUSE		'O'
+#define LOC_PLANAR		'P'
+#define LOC_OTHER_IO		'Q'
+#define LOC_PARALLEL		'R'
+#define LOC_SERIAL		'S'
+#define LOC_DEAD_RING		'T'
+#define LOC_RACKMOUNTED		'U' /* for _u_nit is rack mounted */
+#define LOC_VOLTAGE		'V'
+#define LOC_SWITCH_ADAPTER	'W'
+#define LOC_OTHER		'X'
+#define LOC_FIRMWARE		'Y'
+#define LOC_SCSI		'Z'
+
+/* Tokens for indicators */
+#define TONE_FREQUENCY		0x0001 /* 0 - 1000 (HZ)*/
+#define TONE_VOLUME		0x0002 /* 0 - 100 (%) */
+#define SYSTEM_POWER_STATE	0x0003 
+#define WARNING_LIGHT		0x0004
+#define DISK_ACTIVITY_LIGHT	0x0005
+#define HEX_DISPLAY_UNIT	0x0006
+#define BATTERY_WARNING_TIME	0x0007
+#define CONDITION_CYCLE_REQUEST	0x0008
+#define SURVEILLANCE_INDICATOR	0x2328 /* 9000 */
+#define DR_ACTION		0x2329 /* 9001 */
+#define DR_INDICATOR		0x232a /* 9002 */
+/* 9003 - 9004: Vendor specific */
+/* 9006 - 9999: Vendor specific */
+
+/* other */
+#define MAX_SENSORS		 17  /* I only know of 17 sensors */    
+#define MAX_LINELENGTH          256
+#define SENSOR_PREFIX		"ibm,sensor-"
+#define cel_to_fahr(x)		((x*9/5)+32)
+
+struct individual_sensor {
+	unsigned int token;
+	unsigned int quant;
+};
+
+struct rtas_sensors {
+        struct individual_sensor sensor[MAX_SENSORS];
+	unsigned int quant;
+};
+
+/* Globals */
+static struct rtas_sensors sensors;
+static struct device_node *rtas_node = NULL;
+static unsigned long power_on_time = 0; /* Save the time the user set */
+static char progress_led[MAX_LINELENGTH];
+
+static unsigned long rtas_tone_frequency = 1000;
+static unsigned long rtas_tone_volume = 0;
+
+/* ****************************************************************** */
+/* Declarations */
+static int ppc_rtas_sensors_show(struct seq_file *m, void *v);
+static int ppc_rtas_clock_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_clock_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_progress_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_progress_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_poweron_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_poweron_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+
+static ssize_t ppc_rtas_tone_freq_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v);
+static ssize_t ppc_rtas_tone_volume_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos);
+static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v);
+static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v);
+
+static int poweron_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_poweron_show, NULL);
+}
+
+static const struct file_operations ppc_rtas_poweron_operations = {
+	.open		= poweron_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_poweron_write,
+	.release	= single_release,
+};
+
+static int progress_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_progress_show, NULL);
+}
+
+static const struct file_operations ppc_rtas_progress_operations = {
+	.open		= progress_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_progress_write,
+	.release	= single_release,
+};
+
+static int clock_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_clock_show, NULL);
+}
+
+static const struct file_operations ppc_rtas_clock_operations = {
+	.open		= clock_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_clock_write,
+	.release	= single_release,
+};
+
+static int tone_freq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_tone_freq_show, NULL);
+}
+
+static const struct file_operations ppc_rtas_tone_freq_operations = {
+	.open		= tone_freq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_tone_freq_write,
+	.release	= single_release,
+};
+
+static int tone_volume_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ppc_rtas_tone_volume_show, NULL);
+}
+
+static const struct file_operations ppc_rtas_tone_volume_operations = {
+	.open		= tone_volume_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= ppc_rtas_tone_volume_write,
+	.release	= single_release,
+};
+
+static int ppc_rtas_find_all_sensors(void);
+static void ppc_rtas_process_sensor(struct seq_file *m,
+	struct individual_sensor *s, int state, int error, const char *loc);
+static char *ppc_rtas_process_error(int error);
+static void get_location_code(struct seq_file *m,
+	struct individual_sensor *s, const char *loc);
+static void check_location_string(struct seq_file *m, const char *c);
+static void check_location(struct seq_file *m, const char *c);
+
+static int __init proc_rtas_init(void)
+{
+	if (!machine_is(pseries))
+		return -ENODEV;
+
+	rtas_node = of_find_node_by_name(NULL, "rtas");
+	if (rtas_node == NULL)
+		return -ENODEV;
+
+	proc_create("powerpc/rtas/progress", 0644, NULL,
+		    &ppc_rtas_progress_operations);
+	proc_create("powerpc/rtas/clock", 0644, NULL,
+		    &ppc_rtas_clock_operations);
+	proc_create("powerpc/rtas/poweron", 0644, NULL,
+		    &ppc_rtas_poweron_operations);
+	proc_create_single("powerpc/rtas/sensors", 0444, NULL,
+			ppc_rtas_sensors_show);
+	proc_create("powerpc/rtas/frequency", 0644, NULL,
+		    &ppc_rtas_tone_freq_operations);
+	proc_create("powerpc/rtas/volume", 0644, NULL,
+		    &ppc_rtas_tone_volume_operations);
+	proc_create_single("powerpc/rtas/rmo_buffer", 0400, NULL,
+			ppc_rtas_rmo_buf_show);
+	return 0;
+}
+
+__initcall(proc_rtas_init);
+
+static int parse_number(const char __user *p, size_t count, u64 *val)
+{
+	char buf[40];
+	char *end;
+
+	if (count > 39)
+		return -EINVAL;
+
+	if (copy_from_user(buf, p, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	*val = simple_strtoull(buf, &end, 10);
+	if (*end && *end != '\n')
+		return -EINVAL;
+
+	return 0;
+}
+
+/* ****************************************************************** */
+/* POWER-ON-TIME                                                      */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_poweron_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct rtc_time tm;
+	time64_t nowtime;
+	int error = parse_number(buf, count, &nowtime);
+	if (error)
+		return error;
+
+	power_on_time = nowtime; /* save the time */
+
+	rtc_time64_to_tm(nowtime, &tm);
+
+	error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, 
+			tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */);
+	if (error)
+		printk(KERN_WARNING "error: setting poweron time returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_poweron_show(struct seq_file *m, void *v)
+{
+	if (power_on_time == 0)
+		seq_printf(m, "Power on time not set\n");
+	else
+		seq_printf(m, "%lu\n",power_on_time);
+	return 0;
+}
+
+/* ****************************************************************** */
+/* PROGRESS                                                           */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_progress_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long hex;
+
+	if (count >= MAX_LINELENGTH)
+		count = MAX_LINELENGTH -1;
+	if (copy_from_user(progress_led, buf, count)) { /* save the string */
+		return -EFAULT;
+	}
+	progress_led[count] = 0;
+
+	/* Lets see if the user passed hexdigits */
+	hex = simple_strtoul(progress_led, NULL, 10);
+
+	rtas_progress ((char *)progress_led, hex);
+	return count;
+
+	/* clear the line */
+	/* rtas_progress("                   ", 0xffff);*/
+}
+/* ****************************************************************** */
+static int ppc_rtas_progress_show(struct seq_file *m, void *v)
+{
+	if (progress_led[0])
+		seq_printf(m, "%s\n", progress_led);
+	return 0;
+}
+
+/* ****************************************************************** */
+/* CLOCK                                                              */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_clock_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct rtc_time tm;
+	time64_t nowtime;
+	int error = parse_number(buf, count, &nowtime);
+	if (error)
+		return error;
+
+	rtc_time64_to_tm(nowtime, &tm);
+	error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, 
+			tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			tm.tm_hour, tm.tm_min, tm.tm_sec, 0);
+	if (error)
+		printk(KERN_WARNING "error: setting the clock returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_clock_show(struct seq_file *m, void *v)
+{
+	int ret[8];
+	int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+
+	if (error) {
+		printk(KERN_WARNING "error: reading the clock returned: %s\n", 
+				ppc_rtas_process_error(error));
+		seq_printf(m, "0");
+	} else { 
+		unsigned int year, mon, day, hour, min, sec;
+		year = ret[0]; mon  = ret[1]; day  = ret[2];
+		hour = ret[3]; min  = ret[4]; sec  = ret[5];
+		seq_printf(m, "%lld\n",
+				mktime64(year, mon, day, hour, min, sec));
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+/* SENSOR STUFF                                                       */
+/* ****************************************************************** */
+static int ppc_rtas_sensors_show(struct seq_file *m, void *v)
+{
+	int i,j;
+	int state, error;
+	int get_sensor_state = rtas_token("get-sensor-state");
+
+	seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n");
+	seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n");
+	seq_printf(m, "********************************************************\n");
+
+	if (ppc_rtas_find_all_sensors() != 0) {
+		seq_printf(m, "\nNo sensors are available\n");
+		return 0;
+	}
+
+	for (i=0; i<sensors.quant; i++) {
+		struct individual_sensor *p = &sensors.sensor[i];
+		char rstr[64];
+		const char *loc;
+		int llen, offs;
+
+		sprintf (rstr, SENSOR_PREFIX"%04d", p->token);
+		loc = of_get_property(rtas_node, rstr, &llen);
+
+		/* A sensor may have multiple instances */
+		for (j = 0, offs = 0; j <= p->quant; j++) {
+			error =	rtas_call(get_sensor_state, 2, 2, &state, 
+				  	  p->token, j);
+
+			ppc_rtas_process_sensor(m, p, state, error, loc);
+			seq_putc(m, '\n');
+			if (loc) {
+				offs += strlen(loc) + 1;
+				loc += strlen(loc) + 1;
+				if (offs >= llen)
+					loc = NULL;
+			}
+		}
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+
+static int ppc_rtas_find_all_sensors(void)
+{
+	const unsigned int *utmp;
+	int len, i;
+
+	utmp = of_get_property(rtas_node, "rtas-sensors", &len);
+	if (utmp == NULL) {
+		printk (KERN_ERR "error: could not get rtas-sensors\n");
+		return 1;
+	}
+
+	sensors.quant = len / 8;      /* int + int */
+
+	for (i=0; i<sensors.quant; i++) {
+		sensors.sensor[i].token = *utmp++;
+		sensors.sensor[i].quant = *utmp++;
+	}
+	return 0;
+}
+
+/* ****************************************************************** */
+/*
+ * Builds a string of what rtas returned
+ */
+static char *ppc_rtas_process_error(int error)
+{
+	switch (error) {
+		case SENSOR_CRITICAL_HIGH:
+			return "(critical high)";
+		case SENSOR_WARNING_HIGH:
+			return "(warning high)";
+		case SENSOR_NORMAL:
+			return "(normal)";
+		case SENSOR_WARNING_LOW:
+			return "(warning low)";
+		case SENSOR_CRITICAL_LOW:
+			return "(critical low)";
+		case SENSOR_SUCCESS:
+			return "(read ok)";
+		case SENSOR_HW_ERROR:
+			return "(hardware error)";
+		case SENSOR_BUSY:
+			return "(busy)";
+		case SENSOR_NOT_EXIST:
+			return "(non existent)";
+		case SENSOR_DR_ENTITY:
+			return "(dr entity removed)";
+		default:
+			return "(UNKNOWN)";
+	}
+}
+
+/* ****************************************************************** */
+/*
+ * Builds a string out of what the sensor said
+ */
+
+static void ppc_rtas_process_sensor(struct seq_file *m,
+	struct individual_sensor *s, int state, int error, const char *loc)
+{
+	/* Defined return vales */
+	const char * key_switch[]        = { "Off\t", "Normal\t", "Secure\t", 
+						"Maintenance" };
+	const char * enclosure_switch[]  = { "Closed", "Open" };
+	const char * lid_status[]        = { " ", "Open", "Closed" };
+	const char * power_source[]      = { "AC\t", "Battery", 
+		  				"AC & Battery" };
+	const char * battery_remaining[] = { "Very Low", "Low", "Mid", "High" };
+	const char * epow_sensor[]       = { 
+		"EPOW Reset", "Cooling warning", "Power warning",
+		"System shutdown", "System halt", "EPOW main enclosure",
+		"EPOW power off" };
+	const char * battery_cyclestate[]  = { "None", "In progress", 
+						"Requested" };
+	const char * battery_charging[]    = { "Charging", "Discharging",
+						"No current flow" };
+	const char * ibm_drconnector[]     = { "Empty", "Present", "Unusable", 
+						"Exchange" };
+
+	int have_strings = 0;
+	int num_states = 0;
+	int temperature = 0;
+	int unknown = 0;
+
+	/* What kind of sensor do we have here? */
+	
+	switch (s->token) {
+		case KEY_SWITCH:
+			seq_printf(m, "Key switch:\t");
+			num_states = sizeof(key_switch) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", key_switch[state]);
+				have_strings = 1;
+			}
+			break;
+		case ENCLOSURE_SWITCH:
+			seq_printf(m, "Enclosure switch:\t");
+			num_states = sizeof(enclosure_switch) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						enclosure_switch[state]);
+				have_strings = 1;
+			}
+			break;
+		case THERMAL_SENSOR:
+			seq_printf(m, "Temp. (C/F):\t");
+			temperature = 1;
+			break;
+		case LID_STATUS:
+			seq_printf(m, "Lid status:\t");
+			num_states = sizeof(lid_status) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", lid_status[state]);
+				have_strings = 1;
+			}
+			break;
+		case POWER_SOURCE:
+			seq_printf(m, "Power source:\t");
+			num_states = sizeof(power_source) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						power_source[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_VOLTAGE:
+			seq_printf(m, "Battery voltage:\t");
+			break;
+		case BATTERY_REMAINING:
+			seq_printf(m, "Battery remaining:\t");
+			num_states = sizeof(battery_remaining) / sizeof(char *);
+			if (state < num_states)
+			{
+				seq_printf(m, "%s\t", 
+						battery_remaining[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_PERCENTAGE:
+			seq_printf(m, "Battery percentage:\t");
+			break;
+		case EPOW_SENSOR:
+			seq_printf(m, "EPOW Sensor:\t");
+			num_states = sizeof(epow_sensor) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", epow_sensor[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_CYCLESTATE:
+			seq_printf(m, "Battery cyclestate:\t");
+			num_states = sizeof(battery_cyclestate) / 
+				     	sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						battery_cyclestate[state]);
+				have_strings = 1;
+			}
+			break;
+		case BATTERY_CHARGING:
+			seq_printf(m, "Battery Charging:\t");
+			num_states = sizeof(battery_charging) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						battery_charging[state]);
+				have_strings = 1;
+			}
+			break;
+		case IBM_SURVEILLANCE:
+			seq_printf(m, "Surveillance:\t");
+			break;
+		case IBM_FANRPM:
+			seq_printf(m, "Fan (rpm):\t");
+			break;
+		case IBM_VOLTAGE:
+			seq_printf(m, "Voltage (mv):\t");
+			break;
+		case IBM_DRCONNECTOR:
+			seq_printf(m, "DR connector:\t");
+			num_states = sizeof(ibm_drconnector) / sizeof(char *);
+			if (state < num_states) {
+				seq_printf(m, "%s\t", 
+						ibm_drconnector[state]);
+				have_strings = 1;
+			}
+			break;
+		case IBM_POWERSUPPLY:
+			seq_printf(m, "Powersupply:\t");
+			break;
+		default:
+			seq_printf(m,  "Unknown sensor (type %d), ignoring it\n",
+					s->token);
+			unknown = 1;
+			have_strings = 1;
+			break;
+	}
+	if (have_strings == 0) {
+		if (temperature) {
+			seq_printf(m, "%4d /%4d\t", state, cel_to_fahr(state));
+		} else
+			seq_printf(m, "%10d\t", state);
+	}
+	if (unknown == 0) {
+		seq_printf(m, "%s\t", ppc_rtas_process_error(error));
+		get_location_code(m, s, loc);
+	}
+}
+
+/* ****************************************************************** */
+
+static void check_location(struct seq_file *m, const char *c)
+{
+	switch (c[0]) {
+		case LOC_PLANAR:
+			seq_printf(m, "Planar #%c", c[1]);
+			break;
+		case LOC_CPU:
+			seq_printf(m, "CPU #%c", c[1]);
+			break;
+		case LOC_FAN:
+			seq_printf(m, "Fan #%c", c[1]);
+			break;
+		case LOC_RACKMOUNTED:
+			seq_printf(m, "Rack #%c", c[1]);
+			break;
+		case LOC_VOLTAGE:
+			seq_printf(m, "Voltage #%c", c[1]);
+			break;
+		case LOC_LCD:
+			seq_printf(m, "LCD #%c", c[1]);
+			break;
+		case '.':
+			seq_printf(m, "- %c", c[1]);
+			break;
+		default:
+			seq_printf(m, "Unknown location");
+			break;
+	}
+}
+
+
+/* ****************************************************************** */
+/* 
+ * Format: 
+ * ${LETTER}${NUMBER}[[-/]${LETTER}${NUMBER} [ ... ] ]
+ * the '.' may be an abbreviation
+ */
+static void check_location_string(struct seq_file *m, const char *c)
+{
+	while (*c) {
+		if (isalpha(*c) || *c == '.')
+			check_location(m, c);
+		else if (*c == '/' || *c == '-')
+			seq_printf(m, " at ");
+		c++;
+	}
+}
+
+
+/* ****************************************************************** */
+
+static void get_location_code(struct seq_file *m, struct individual_sensor *s,
+		const char *loc)
+{
+	if (!loc || !*loc) {
+		seq_printf(m, "---");/* does not have a location */
+	} else {
+		check_location_string(m, loc);
+	}
+	seq_putc(m, ' ');
+}
+/* ****************************************************************** */
+/* INDICATORS - Tone Frequency                                        */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_tone_freq_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	u64 freq;
+	int error = parse_number(buf, count, &freq);
+	if (error)
+		return error;
+
+	rtas_tone_frequency = freq; /* save it for later */
+	error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
+			TONE_FREQUENCY, 0, freq);
+	if (error)
+		printk(KERN_WARNING "error: setting tone frequency returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", rtas_tone_frequency);
+	return 0;
+}
+/* ****************************************************************** */
+/* INDICATORS - Tone Volume                                           */
+/* ****************************************************************** */
+static ssize_t ppc_rtas_tone_volume_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	u64 volume;
+	int error = parse_number(buf, count, &volume);
+	if (error)
+		return error;
+
+	if (volume > 100)
+		volume = 100;
+	
+        rtas_tone_volume = volume; /* save it for later */
+	error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
+			TONE_VOLUME, 0, volume);
+	if (error)
+		printk(KERN_WARNING "error: setting tone volume returned: %s\n", 
+				ppc_rtas_process_error(error));
+	return count;
+}
+/* ****************************************************************** */
+static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%lu\n", rtas_tone_volume);
+	return 0;
+}
+
+#define RMO_READ_BUF_MAX 30
+
+/* RTAS Userspace access */
+static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_RMOBUF_MAX);
+	return 0;
+}
diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c
new file mode 100644
index 000000000..a28239b8b
--- /dev/null
+++ b/arch/powerpc/kernel/rtas-rtc.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+#include <linux/ratelimit.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/time.h>
+
+
+#define MAX_RTC_WAIT 5000	/* 5 sec */
+#define RTAS_CLOCK_BUSY (-2)
+time64_t __init rtas_get_boot_time(void)
+{
+	int ret[8];
+	int error;
+	unsigned int wait_time;
+	u64 max_wait_tb;
+
+	max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+		error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+
+		wait_time = rtas_busy_delay_time(error);
+		if (wait_time) {
+			/* This is boot time so we spin. */
+			udelay(wait_time*1000);
+		}
+	} while (wait_time && (get_tb() < max_wait_tb));
+
+	if (error != 0) {
+		printk_ratelimited(KERN_WARNING
+				   "error: reading the clock failed (%d)\n",
+				   error);
+		return 0;
+	}
+
+	return mktime64(ret[0], ret[1], ret[2], ret[3], ret[4], ret[5]);
+}
+
+/* NOTE: get_rtc_time will get an error if executed in interrupt context
+ * and if a delay is needed to read the clock.  In this case we just
+ * silently return without updating rtc_tm.
+ */
+void rtas_get_rtc_time(struct rtc_time *rtc_tm)
+{
+        int ret[8];
+	int error;
+	unsigned int wait_time;
+	u64 max_wait_tb;
+
+	max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+		error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+
+		wait_time = rtas_busy_delay_time(error);
+		if (wait_time) {
+			if (in_interrupt()) {
+				memset(rtc_tm, 0, sizeof(struct rtc_time));
+				printk_ratelimited(KERN_WARNING
+						   "error: reading clock "
+						   "would delay interrupt\n");
+				return;	/* delay not allowed */
+			}
+			msleep(wait_time);
+		}
+	} while (wait_time && (get_tb() < max_wait_tb));
+
+	if (error != 0) {
+		printk_ratelimited(KERN_WARNING
+				   "error: reading the clock failed (%d)\n",
+				   error);
+		return;
+        }
+
+	rtc_tm->tm_sec = ret[5];
+	rtc_tm->tm_min = ret[4];
+	rtc_tm->tm_hour = ret[3];
+	rtc_tm->tm_mday = ret[2];
+	rtc_tm->tm_mon = ret[1] - 1;
+	rtc_tm->tm_year = ret[0] - 1900;
+}
+
+int rtas_set_rtc_time(struct rtc_time *tm)
+{
+	int error, wait_time;
+	u64 max_wait_tb;
+
+	max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
+	do {
+	        error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL,
+				  tm->tm_year + 1900, tm->tm_mon + 1,
+				  tm->tm_mday, tm->tm_hour, tm->tm_min,
+				  tm->tm_sec, 0);
+
+		wait_time = rtas_busy_delay_time(error);
+		if (wait_time) {
+			if (in_interrupt())
+				return 1;	/* probably decrementer */
+			msleep(wait_time);
+		}
+	} while (wait_time && (get_tb() < max_wait_tb));
+
+	if (error != 0)
+		printk_ratelimited(KERN_WARNING
+				   "error: setting the clock failed (%d)\n",
+				   error);
+
+        return 0;
+}
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
new file mode 100644
index 000000000..362c20c8c
--- /dev/null
+++ b/arch/powerpc/kernel/rtas.c
@@ -0,0 +1,1419 @@
+/*
+ *
+ * Procedures for interfacing to the RTAS on CHRP machines.
+ *
+ * Peter Bergner, IBM	March 2001.
+ * Copyright (C) 2001 IBM.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <stdarg.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/delay.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
+
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/page.h>
+#include <asm/param.h>
+#include <asm/delay.h>
+#include <linux/uaccess.h>
+#include <asm/udbg.h>
+#include <asm/syscalls.h>
+#include <asm/smp.h>
+#include <linux/atomic.h>
+#include <asm/time.h>
+#include <asm/mmu.h>
+#include <asm/topology.h>
+
+/* This is here deliberately so it's only used in this file */
+void enter_rtas(unsigned long);
+
+struct rtas_t rtas = {
+	.lock = __ARCH_SPIN_LOCK_UNLOCKED
+};
+EXPORT_SYMBOL(rtas);
+
+DEFINE_SPINLOCK(rtas_data_buf_lock);
+EXPORT_SYMBOL(rtas_data_buf_lock);
+
+char rtas_data_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
+EXPORT_SYMBOL(rtas_data_buf);
+
+unsigned long rtas_rmo_buf;
+
+/*
+ * If non-NULL, this gets called when the kernel terminates.
+ * This is done like this so rtas_flash can be a module.
+ */
+void (*rtas_flash_term_hook)(int);
+EXPORT_SYMBOL(rtas_flash_term_hook);
+
+/* RTAS use home made raw locking instead of spin_lock_irqsave
+ * because those can be called from within really nasty contexts
+ * such as having the timebase stopped which would lockup with
+ * normal locks and spinlock debugging enabled
+ */
+static unsigned long lock_rtas(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	arch_spin_lock(&rtas.lock);
+	return flags;
+}
+
+static void unlock_rtas(unsigned long flags)
+{
+	arch_spin_unlock(&rtas.lock);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+
+/*
+ * call_rtas_display_status and call_rtas_display_status_delay
+ * are designed only for very early low-level debugging, which
+ * is why the token is hard-coded to 10.
+ */
+static void call_rtas_display_status(unsigned char c)
+{
+	unsigned long s;
+
+	if (!rtas.base)
+		return;
+
+	s = lock_rtas();
+	rtas_call_unlocked(&rtas.args, 10, 1, 1, NULL, c);
+	unlock_rtas(s);
+}
+
+static void call_rtas_display_status_delay(char c)
+{
+	static int pending_newline = 0;  /* did last write end with unprinted newline? */
+	static int width = 16;
+
+	if (c == '\n') {	
+		while (width-- > 0)
+			call_rtas_display_status(' ');
+		width = 16;
+		mdelay(500);
+		pending_newline = 1;
+	} else {
+		if (pending_newline) {
+			call_rtas_display_status('\r');
+			call_rtas_display_status('\n');
+		} 
+		pending_newline = 0;
+		if (width--) {
+			call_rtas_display_status(c);
+			udelay(10000);
+		}
+	}
+}
+
+void __init udbg_init_rtas_panel(void)
+{
+	udbg_putc = call_rtas_display_status_delay;
+}
+
+#ifdef CONFIG_UDBG_RTAS_CONSOLE
+
+/* If you think you're dying before early_init_dt_scan_rtas() does its
+ * work, you can hard code the token values for your firmware here and
+ * hardcode rtas.base/entry etc.
+ */
+static unsigned int rtas_putchar_token = RTAS_UNKNOWN_SERVICE;
+static unsigned int rtas_getchar_token = RTAS_UNKNOWN_SERVICE;
+
+static void udbg_rtascon_putc(char c)
+{
+	int tries;
+
+	if (!rtas.base)
+		return;
+
+	/* Add CRs before LFs */
+	if (c == '\n')
+		udbg_rtascon_putc('\r');
+
+	/* if there is more than one character to be displayed, wait a bit */
+	for (tries = 0; tries < 16; tries++) {
+		if (rtas_call(rtas_putchar_token, 1, 1, NULL, c) == 0)
+			break;
+		udelay(1000);
+	}
+}
+
+static int udbg_rtascon_getc_poll(void)
+{
+	int c;
+
+	if (!rtas.base)
+		return -1;
+
+	if (rtas_call(rtas_getchar_token, 0, 2, &c))
+		return -1;
+
+	return c;
+}
+
+static int udbg_rtascon_getc(void)
+{
+	int c;
+
+	while ((c = udbg_rtascon_getc_poll()) == -1)
+		;
+
+	return c;
+}
+
+
+void __init udbg_init_rtas_console(void)
+{
+	udbg_putc = udbg_rtascon_putc;
+	udbg_getc = udbg_rtascon_getc;
+	udbg_getc_poll = udbg_rtascon_getc_poll;
+}
+#endif /* CONFIG_UDBG_RTAS_CONSOLE */
+
+void rtas_progress(char *s, unsigned short hex)
+{
+	struct device_node *root;
+	int width;
+	const __be32 *p;
+	char *os;
+	static int display_character, set_indicator;
+	static int display_width, display_lines, form_feed;
+	static const int *row_width;
+	static DEFINE_SPINLOCK(progress_lock);
+	static int current_line;
+	static int pending_newline = 0;  /* did last write end with unprinted newline? */
+
+	if (!rtas.base)
+		return;
+
+	if (display_width == 0) {
+		display_width = 0x10;
+		if ((root = of_find_node_by_path("/rtas"))) {
+			if ((p = of_get_property(root,
+					"ibm,display-line-length", NULL)))
+				display_width = be32_to_cpu(*p);
+			if ((p = of_get_property(root,
+					"ibm,form-feed", NULL)))
+				form_feed = be32_to_cpu(*p);
+			if ((p = of_get_property(root,
+					"ibm,display-number-of-lines", NULL)))
+				display_lines = be32_to_cpu(*p);
+			row_width = of_get_property(root,
+					"ibm,display-truncation-length", NULL);
+			of_node_put(root);
+		}
+		display_character = rtas_token("display-character");
+		set_indicator = rtas_token("set-indicator");
+	}
+
+	if (display_character == RTAS_UNKNOWN_SERVICE) {
+		/* use hex display if available */
+		if (set_indicator != RTAS_UNKNOWN_SERVICE)
+			rtas_call(set_indicator, 3, 1, NULL, 6, 0, hex);
+		return;
+	}
+
+	spin_lock(&progress_lock);
+
+	/*
+	 * Last write ended with newline, but we didn't print it since
+	 * it would just clear the bottom line of output. Print it now
+	 * instead.
+	 *
+	 * If no newline is pending and form feed is supported, clear the
+	 * display with a form feed; otherwise, print a CR to start output
+	 * at the beginning of the line.
+	 */
+	if (pending_newline) {
+		rtas_call(display_character, 1, 1, NULL, '\r');
+		rtas_call(display_character, 1, 1, NULL, '\n');
+		pending_newline = 0;
+	} else {
+		current_line = 0;
+		if (form_feed)
+			rtas_call(display_character, 1, 1, NULL,
+				  (char)form_feed);
+		else
+			rtas_call(display_character, 1, 1, NULL, '\r');
+	}
+ 
+	if (row_width)
+		width = row_width[current_line];
+	else
+		width = display_width;
+	os = s;
+	while (*os) {
+		if (*os == '\n' || *os == '\r') {
+			/* If newline is the last character, save it
+			 * until next call to avoid bumping up the
+			 * display output.
+			 */
+			if (*os == '\n' && !os[1]) {
+				pending_newline = 1;
+				current_line++;
+				if (current_line > display_lines-1)
+					current_line = display_lines-1;
+				spin_unlock(&progress_lock);
+				return;
+			}
+ 
+			/* RTAS wants CR-LF, not just LF */
+ 
+			if (*os == '\n') {
+				rtas_call(display_character, 1, 1, NULL, '\r');
+				rtas_call(display_character, 1, 1, NULL, '\n');
+			} else {
+				/* CR might be used to re-draw a line, so we'll
+				 * leave it alone and not add LF.
+				 */
+				rtas_call(display_character, 1, 1, NULL, *os);
+			}
+ 
+			if (row_width)
+				width = row_width[current_line];
+			else
+				width = display_width;
+		} else {
+			width--;
+			rtas_call(display_character, 1, 1, NULL, *os);
+		}
+ 
+		os++;
+ 
+		/* if we overwrite the screen length */
+		if (width <= 0)
+			while ((*os != 0) && (*os != '\n') && (*os != '\r'))
+				os++;
+	}
+ 
+	spin_unlock(&progress_lock);
+}
+EXPORT_SYMBOL(rtas_progress);		/* needed by rtas_flash module */
+
+int rtas_token(const char *service)
+{
+	const __be32 *tokp;
+	if (rtas.dev == NULL)
+		return RTAS_UNKNOWN_SERVICE;
+	tokp = of_get_property(rtas.dev, service, NULL);
+	return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE;
+}
+EXPORT_SYMBOL(rtas_token);
+
+int rtas_service_present(const char *service)
+{
+	return rtas_token(service) != RTAS_UNKNOWN_SERVICE;
+}
+EXPORT_SYMBOL(rtas_service_present);
+
+#ifdef CONFIG_RTAS_ERROR_LOGGING
+/*
+ * Return the firmware-specified size of the error log buffer
+ *  for all rtas calls that require an error buffer argument.
+ *  This includes 'check-exception' and 'rtas-last-error'.
+ */
+int rtas_get_error_log_max(void)
+{
+	static int rtas_error_log_max;
+	if (rtas_error_log_max)
+		return rtas_error_log_max;
+
+	rtas_error_log_max = rtas_token ("rtas-error-log-max");
+	if ((rtas_error_log_max == RTAS_UNKNOWN_SERVICE) ||
+	    (rtas_error_log_max > RTAS_ERROR_LOG_MAX)) {
+		printk (KERN_WARNING "RTAS: bad log buffer size %d\n",
+			rtas_error_log_max);
+		rtas_error_log_max = RTAS_ERROR_LOG_MAX;
+	}
+	return rtas_error_log_max;
+}
+EXPORT_SYMBOL(rtas_get_error_log_max);
+
+
+static char rtas_err_buf[RTAS_ERROR_LOG_MAX];
+static int rtas_last_error_token;
+
+/** Return a copy of the detailed error text associated with the
+ *  most recent failed call to rtas.  Because the error text
+ *  might go stale if there are any other intervening rtas calls,
+ *  this routine must be called atomically with whatever produced
+ *  the error (i.e. with rtas.lock still held from the previous call).
+ */
+static char *__fetch_rtas_last_error(char *altbuf)
+{
+	struct rtas_args err_args, save_args;
+	u32 bufsz;
+	char *buf = NULL;
+
+	if (rtas_last_error_token == -1)
+		return NULL;
+
+	bufsz = rtas_get_error_log_max();
+
+	err_args.token = cpu_to_be32(rtas_last_error_token);
+	err_args.nargs = cpu_to_be32(2);
+	err_args.nret = cpu_to_be32(1);
+	err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf));
+	err_args.args[1] = cpu_to_be32(bufsz);
+	err_args.args[2] = 0;
+
+	save_args = rtas.args;
+	rtas.args = err_args;
+
+	enter_rtas(__pa(&rtas.args));
+
+	err_args = rtas.args;
+	rtas.args = save_args;
+
+	/* Log the error in the unlikely case that there was one. */
+	if (unlikely(err_args.args[2] == 0)) {
+		if (altbuf) {
+			buf = altbuf;
+		} else {
+			buf = rtas_err_buf;
+			if (slab_is_available())
+				buf = kmalloc(RTAS_ERROR_LOG_MAX, GFP_ATOMIC);
+		}
+		if (buf)
+			memcpy(buf, rtas_err_buf, RTAS_ERROR_LOG_MAX);
+	}
+
+	return buf;
+}
+
+#define get_errorlog_buffer()	kmalloc(RTAS_ERROR_LOG_MAX, GFP_KERNEL)
+
+#else /* CONFIG_RTAS_ERROR_LOGGING */
+#define __fetch_rtas_last_error(x)	NULL
+#define get_errorlog_buffer()		NULL
+#endif
+
+
+static void
+va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
+		      va_list list)
+{
+	int i;
+
+	args->token = cpu_to_be32(token);
+	args->nargs = cpu_to_be32(nargs);
+	args->nret  = cpu_to_be32(nret);
+	args->rets  = &(args->args[nargs]);
+
+	for (i = 0; i < nargs; ++i)
+		args->args[i] = cpu_to_be32(va_arg(list, __u32));
+
+	for (i = 0; i < nret; ++i)
+		args->rets[i] = 0;
+
+	enter_rtas(__pa(args));
+}
+
+void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...)
+{
+	va_list list;
+
+	va_start(list, nret);
+	va_rtas_call_unlocked(args, token, nargs, nret, list);
+	va_end(list);
+}
+
+int rtas_call(int token, int nargs, int nret, int *outputs, ...)
+{
+	va_list list;
+	int i;
+	unsigned long s;
+	struct rtas_args *rtas_args;
+	char *buff_copy = NULL;
+	int ret;
+
+	if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE)
+		return -1;
+
+	s = lock_rtas();
+
+	/* We use the global rtas args buffer */
+	rtas_args = &rtas.args;
+
+	va_start(list, outputs);
+	va_rtas_call_unlocked(rtas_args, token, nargs, nret, list);
+	va_end(list);
+
+	/* A -1 return code indicates that the last command couldn't
+	   be completed due to a hardware error. */
+	if (be32_to_cpu(rtas_args->rets[0]) == -1)
+		buff_copy = __fetch_rtas_last_error(NULL);
+
+	if (nret > 1 && outputs != NULL)
+		for (i = 0; i < nret-1; ++i)
+			outputs[i] = be32_to_cpu(rtas_args->rets[i+1]);
+	ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0;
+
+	unlock_rtas(s);
+
+	if (buff_copy) {
+		log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0);
+		if (slab_is_available())
+			kfree(buff_copy);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rtas_call);
+
+/* For RTAS_BUSY (-2), delay for 1 millisecond.  For an extended busy status
+ * code of 990n, perform the hinted delay of 10^n (last digit) milliseconds.
+ */
+unsigned int rtas_busy_delay_time(int status)
+{
+	int order;
+	unsigned int ms = 0;
+
+	if (status == RTAS_BUSY) {
+		ms = 1;
+	} else if (status >= RTAS_EXTENDED_DELAY_MIN &&
+		   status <= RTAS_EXTENDED_DELAY_MAX) {
+		order = status - RTAS_EXTENDED_DELAY_MIN;
+		for (ms = 1; order > 0; order--)
+			ms *= 10;
+	}
+
+	return ms;
+}
+EXPORT_SYMBOL(rtas_busy_delay_time);
+
+/* For an RTAS busy status code, perform the hinted delay. */
+unsigned int rtas_busy_delay(int status)
+{
+	unsigned int ms;
+
+	might_sleep();
+	ms = rtas_busy_delay_time(status);
+	if (ms && need_resched())
+		msleep(ms);
+
+	return ms;
+}
+EXPORT_SYMBOL(rtas_busy_delay);
+
+static int rtas_error_rc(int rtas_rc)
+{
+	int rc;
+
+	switch (rtas_rc) {
+		case -1: 		/* Hardware Error */
+			rc = -EIO;
+			break;
+		case -3:		/* Bad indicator/domain/etc */
+			rc = -EINVAL;
+			break;
+		case -9000:		/* Isolation error */
+			rc = -EFAULT;
+			break;
+		case -9001:		/* Outstanding TCE/PTE */
+			rc = -EEXIST;
+			break;
+		case -9002:		/* No usable slot */
+			rc = -ENODEV;
+			break;
+		default:
+			printk(KERN_ERR "%s: unexpected RTAS error %d\n",
+					__func__, rtas_rc);
+			rc = -ERANGE;
+			break;
+	}
+	return rc;
+}
+
+int rtas_get_power_level(int powerdomain, int *level)
+{
+	int token = rtas_token("get-power-level");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	while ((rc = rtas_call(token, 1, 2, level, powerdomain)) == RTAS_BUSY)
+		udelay(1);
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+EXPORT_SYMBOL(rtas_get_power_level);
+
+int rtas_set_power_level(int powerdomain, int level, int *setlevel)
+{
+	int token = rtas_token("set-power-level");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	do {
+		rc = rtas_call(token, 2, 2, setlevel, powerdomain, level);
+	} while (rtas_busy_delay(rc));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+EXPORT_SYMBOL(rtas_set_power_level);
+
+int rtas_get_sensor(int sensor, int index, int *state)
+{
+	int token = rtas_token("get-sensor-state");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	do {
+		rc = rtas_call(token, 2, 2, state, sensor, index);
+	} while (rtas_busy_delay(rc));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+EXPORT_SYMBOL(rtas_get_sensor);
+
+int rtas_get_sensor_fast(int sensor, int index, int *state)
+{
+	int token = rtas_token("get-sensor-state");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	rc = rtas_call(token, 2, 2, state, sensor, index);
+	WARN_ON(rc == RTAS_BUSY || (rc >= RTAS_EXTENDED_DELAY_MIN &&
+				    rc <= RTAS_EXTENDED_DELAY_MAX));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+
+bool rtas_indicator_present(int token, int *maxindex)
+{
+	int proplen, count, i;
+	const struct indicator_elem {
+		__be32 token;
+		__be32 maxindex;
+	} *indicators;
+
+	indicators = of_get_property(rtas.dev, "rtas-indicators", &proplen);
+	if (!indicators)
+		return false;
+
+	count = proplen / sizeof(struct indicator_elem);
+
+	for (i = 0; i < count; i++) {
+		if (__be32_to_cpu(indicators[i].token) != token)
+			continue;
+		if (maxindex)
+			*maxindex = __be32_to_cpu(indicators[i].maxindex);
+		return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(rtas_indicator_present);
+
+int rtas_set_indicator(int indicator, int index, int new_value)
+{
+	int token = rtas_token("set-indicator");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	do {
+		rc = rtas_call(token, 3, 1, NULL, indicator, index, new_value);
+	} while (rtas_busy_delay(rc));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+	return rc;
+}
+EXPORT_SYMBOL(rtas_set_indicator);
+
+/*
+ * Ignoring RTAS extended delay
+ */
+int rtas_set_indicator_fast(int indicator, int index, int new_value)
+{
+	int rc;
+	int token = rtas_token("set-indicator");
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	rc = rtas_call(token, 3, 1, NULL, indicator, index, new_value);
+
+	WARN_ON(rc == RTAS_BUSY || (rc >= RTAS_EXTENDED_DELAY_MIN &&
+				    rc <= RTAS_EXTENDED_DELAY_MAX));
+
+	if (rc < 0)
+		return rtas_error_rc(rc);
+
+	return rc;
+}
+
+void __noreturn rtas_restart(char *cmd)
+{
+	if (rtas_flash_term_hook)
+		rtas_flash_term_hook(SYS_RESTART);
+	printk("RTAS system-reboot returned %d\n",
+	       rtas_call(rtas_token("system-reboot"), 0, 1, NULL));
+	for (;;);
+}
+
+void rtas_power_off(void)
+{
+	if (rtas_flash_term_hook)
+		rtas_flash_term_hook(SYS_POWER_OFF);
+	/* allow power on only with power button press */
+	printk("RTAS power-off returned %d\n",
+	       rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1));
+	for (;;);
+}
+
+void __noreturn rtas_halt(void)
+{
+	if (rtas_flash_term_hook)
+		rtas_flash_term_hook(SYS_HALT);
+	/* allow power on only with power button press */
+	printk("RTAS power-off returned %d\n",
+	       rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1));
+	for (;;);
+}
+
+/* Must be in the RMO region, so we place it here */
+static char rtas_os_term_buf[2048];
+
+void rtas_os_term(char *str)
+{
+	int status;
+
+	/*
+	 * Firmware with the ibm,extended-os-term property is guaranteed
+	 * to always return from an ibm,os-term call. Earlier versions without
+	 * this property may terminate the partition which we want to avoid
+	 * since it interferes with panic_timeout.
+	 */
+	if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") ||
+	    RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term"))
+		return;
+
+	snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
+
+	do {
+		status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL,
+				   __pa(rtas_os_term_buf));
+	} while (rtas_busy_delay(status));
+
+	if (status != 0)
+		printk(KERN_EMERG "ibm,os-term call failed %d\n", status);
+}
+
+static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE;
+#ifdef CONFIG_PPC_PSERIES
+static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done)
+{
+	u16 slb_size = mmu_slb_size;
+	int rc = H_MULTI_THREADS_ACTIVE;
+	int cpu;
+
+	slb_set_size(SLB_MIN_SIZE);
+	printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id());
+
+	while (rc == H_MULTI_THREADS_ACTIVE && !atomic_read(&data->done) &&
+	       !atomic_read(&data->error))
+		rc = rtas_call(data->token, 0, 1, NULL);
+
+	if (rc || atomic_read(&data->error)) {
+		printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc);
+		slb_set_size(slb_size);
+	}
+
+	if (atomic_read(&data->error))
+		rc = atomic_read(&data->error);
+
+	atomic_set(&data->error, rc);
+	pSeries_coalesce_init();
+
+	if (wake_when_done) {
+		atomic_set(&data->done, 1);
+
+		for_each_online_cpu(cpu)
+			plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+	}
+
+	if (atomic_dec_return(&data->working) == 0)
+		complete(data->complete);
+
+	return rc;
+}
+
+int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data)
+{
+	atomic_inc(&data->working);
+	return __rtas_suspend_last_cpu(data, 0);
+}
+
+static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done)
+{
+	long rc = H_SUCCESS;
+	unsigned long msr_save;
+	int cpu;
+
+	atomic_inc(&data->working);
+
+	/* really need to ensure MSR.EE is off for H_JOIN */
+	msr_save = mfmsr();
+	mtmsr(msr_save & ~(MSR_EE));
+
+	while (rc == H_SUCCESS && !atomic_read(&data->done) && !atomic_read(&data->error))
+		rc = plpar_hcall_norets(H_JOIN);
+
+	mtmsr(msr_save);
+
+	if (rc == H_SUCCESS) {
+		/* This cpu was prodded and the suspend is complete. */
+		goto out;
+	} else if (rc == H_CONTINUE) {
+		/* All other cpus are in H_JOIN, this cpu does
+		 * the suspend.
+		 */
+		return __rtas_suspend_last_cpu(data, wake_when_done);
+	} else {
+		printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n",
+		       smp_processor_id(), rc);
+		atomic_set(&data->error, rc);
+	}
+
+	if (wake_when_done) {
+		atomic_set(&data->done, 1);
+
+		/* This cpu did the suspend or got an error; in either case,
+		 * we need to prod all other other cpus out of join state.
+		 * Extra prods are harmless.
+		 */
+		for_each_online_cpu(cpu)
+			plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+	}
+out:
+	if (atomic_dec_return(&data->working) == 0)
+		complete(data->complete);
+	return rc;
+}
+
+int rtas_suspend_cpu(struct rtas_suspend_me_data *data)
+{
+	return __rtas_suspend_cpu(data, 0);
+}
+
+static void rtas_percpu_suspend_me(void *info)
+{
+	__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
+}
+
+enum rtas_cpu_state {
+	DOWN,
+	UP,
+};
+
+#ifndef CONFIG_SMP
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	if (!cpumask_empty(cpus)) {
+		cpumask_clear(cpus);
+		return -EINVAL;
+	} else
+		return 0;
+}
+#else
+/* On return cpumask will be altered to indicate CPUs changed.
+ * CPUs with states changed will be set in the mask,
+ * CPUs with status unchanged will be unset in the mask. */
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	int cpu;
+	int cpuret = 0;
+	int ret = 0;
+
+	if (cpumask_empty(cpus))
+		return 0;
+
+	for_each_cpu(cpu, cpus) {
+		struct device *dev = get_cpu_device(cpu);
+
+		switch (state) {
+		case DOWN:
+			cpuret = device_offline(dev);
+			break;
+		case UP:
+			cpuret = device_online(dev);
+			break;
+		}
+		if (cpuret < 0) {
+			pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
+					__func__,
+					((state == UP) ? "up" : "down"),
+					cpu, cpuret);
+			if (!ret)
+				ret = cpuret;
+			if (state == UP) {
+				/* clear bits for unchanged cpus, return */
+				cpumask_shift_right(cpus, cpus, cpu);
+				cpumask_shift_left(cpus, cpus, cpu);
+				break;
+			} else {
+				/* clear bit for unchanged cpu, continue */
+				cpumask_clear_cpu(cpu, cpus);
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
+
+int rtas_online_cpus_mask(cpumask_var_t cpus)
+{
+	int ret;
+
+	ret = rtas_cpu_state_change_mask(UP, cpus);
+
+	if (ret) {
+		cpumask_var_t tmp_mask;
+
+		if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
+			return ret;
+
+		/* Use tmp_mask to preserve cpus mask from first failure */
+		cpumask_copy(tmp_mask, cpus);
+		rtas_offline_cpus_mask(tmp_mask);
+		free_cpumask_var(tmp_mask);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rtas_online_cpus_mask);
+
+int rtas_offline_cpus_mask(cpumask_var_t cpus)
+{
+	return rtas_cpu_state_change_mask(DOWN, cpus);
+}
+EXPORT_SYMBOL(rtas_offline_cpus_mask);
+
+int rtas_ibm_suspend_me(u64 handle)
+{
+	long state;
+	long rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	struct rtas_suspend_me_data data;
+	DECLARE_COMPLETION_ONSTACK(done);
+	cpumask_var_t offline_mask;
+	int cpuret;
+
+	if (!rtas_service_present("ibm,suspend-me"))
+		return -ENOSYS;
+
+	/* Make sure the state is valid */
+	rc = plpar_hcall(H_VASI_STATE, retbuf, handle);
+
+	state = retbuf[0];
+
+	if (rc) {
+		printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned %ld\n",rc);
+		return rc;
+	} else if (state == H_VASI_ENABLED) {
+		return -EAGAIN;
+	} else if (state != H_VASI_SUSPENDING) {
+		printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned state %ld\n",
+		       state);
+		return -EIO;
+	}
+
+	if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	atomic_set(&data.working, 0);
+	atomic_set(&data.done, 0);
+	atomic_set(&data.error, 0);
+	data.token = rtas_token("ibm,suspend-me");
+	data.complete = &done;
+
+	lock_device_hotplug();
+
+	/* All present CPUs must be online */
+	cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
+	cpuret = rtas_online_cpus_mask(offline_mask);
+	if (cpuret) {
+		pr_err("%s: Could not bring present CPUs online.\n", __func__);
+		atomic_set(&data.error, cpuret);
+		goto out;
+	}
+
+	cpu_hotplug_disable();
+	stop_topology_update();
+
+	/* Call function on all CPUs.  One of us will make the
+	 * rtas call
+	 */
+	if (on_each_cpu(rtas_percpu_suspend_me, &data, 0))
+		atomic_set(&data.error, -EINVAL);
+
+	wait_for_completion(&done);
+
+	if (atomic_read(&data.error) != 0)
+		printk(KERN_ERR "Error doing global join\n");
+
+	start_topology_update();
+	cpu_hotplug_enable();
+
+	/* Take down CPUs not online prior to suspend */
+	cpuret = rtas_offline_cpus_mask(offline_mask);
+	if (cpuret)
+		pr_warn("%s: Could not restore CPUs to offline state.\n",
+				__func__);
+
+out:
+	unlock_device_hotplug();
+	free_cpumask_var(offline_mask);
+	return atomic_read(&data.error);
+}
+#else /* CONFIG_PPC_PSERIES */
+int rtas_ibm_suspend_me(u64 handle)
+{
+	return -ENOSYS;
+}
+#endif
+
+/**
+ * Find a specific pseries error log in an RTAS extended event log.
+ * @log: RTAS error/event log
+ * @section_id: two character section identifier
+ *
+ * Returns a pointer to the specified errorlog or NULL if not found.
+ */
+struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
+					      uint16_t section_id)
+{
+	struct rtas_ext_event_log_v6 *ext_log =
+		(struct rtas_ext_event_log_v6 *)log->buffer;
+	struct pseries_errorlog *sect;
+	unsigned char *p, *log_end;
+	uint32_t ext_log_length = rtas_error_extended_log_length(log);
+	uint8_t log_format = rtas_ext_event_log_format(ext_log);
+	uint32_t company_id = rtas_ext_event_company_id(ext_log);
+
+	/* Check that we understand the format */
+	if (ext_log_length < sizeof(struct rtas_ext_event_log_v6) ||
+	    log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG ||
+	    company_id != RTAS_V6EXT_COMPANY_ID_IBM)
+		return NULL;
+
+	log_end = log->buffer + ext_log_length;
+	p = ext_log->vendor_log;
+
+	while (p < log_end) {
+		sect = (struct pseries_errorlog *)p;
+		if (pseries_errorlog_id(sect) == section_id)
+			return sect;
+		p += pseries_errorlog_length(sect);
+	}
+
+	return NULL;
+}
+
+#ifdef CONFIG_PPC_RTAS_FILTER
+
+/*
+ * The sys_rtas syscall, as originally designed, allows root to pass
+ * arbitrary physical addresses to RTAS calls. A number of RTAS calls
+ * can be abused to write to arbitrary memory and do other things that
+ * are potentially harmful to system integrity, and thus should only
+ * be used inside the kernel and not exposed to userspace.
+ *
+ * All known legitimate users of the sys_rtas syscall will only ever
+ * pass addresses that fall within the RMO buffer, and use a known
+ * subset of RTAS calls.
+ *
+ * Accordingly, we filter RTAS requests to check that the call is
+ * permitted, and that provided pointers fall within the RMO buffer.
+ * The rtas_filters list contains an entry for each permitted call,
+ * with the indexes of the parameters which are expected to contain
+ * addresses and sizes of buffers allocated inside the RMO buffer.
+ */
+struct rtas_filter {
+	const char *name;
+	int token;
+	/* Indexes into the args buffer, -1 if not used */
+	int buf_idx1;
+	int size_idx1;
+	int buf_idx2;
+	int size_idx2;
+
+	int fixed_size;
+};
+
+static struct rtas_filter rtas_filters[] __ro_after_init = {
+	{ "ibm,activate-firmware", -1, -1, -1, -1, -1 },
+	{ "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 },	/* Special cased */
+	{ "display-character", -1, -1, -1, -1, -1 },
+	{ "ibm,display-message", -1, 0, -1, -1, -1 },
+	{ "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
+	{ "ibm,close-errinjct", -1, -1, -1, -1, -1 },
+	{ "ibm,open-errinjct", -1, -1, -1, -1, -1 },
+	{ "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
+	{ "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
+	{ "ibm,get-indices", -1, 2, 3, -1, -1 },
+	{ "get-power-level", -1, -1, -1, -1, -1 },
+	{ "get-sensor-state", -1, -1, -1, -1, -1 },
+	{ "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
+	{ "get-time-of-day", -1, -1, -1, -1, -1 },
+	{ "ibm,get-vpd", -1, 0, -1, 1, 2 },
+	{ "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
+	{ "ibm,platform-dump", -1, 4, 5, -1, -1 },
+	{ "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
+	{ "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
+	{ "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
+	{ "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
+	{ "set-indicator", -1, -1, -1, -1, -1 },
+	{ "set-power-level", -1, -1, -1, -1, -1 },
+	{ "set-time-for-power-on", -1, -1, -1, -1, -1 },
+	{ "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
+	{ "set-time-of-day", -1, -1, -1, -1, -1 },
+	{ "ibm,suspend-me", -1, -1, -1, -1, -1 },
+	{ "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
+	{ "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
+	{ "ibm,physical-attestation", -1, 0, 1, -1, -1 },
+};
+
+static bool in_rmo_buf(u32 base, u32 end)
+{
+	return base >= rtas_rmo_buf &&
+		base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) &&
+		base <= end &&
+		end >= rtas_rmo_buf &&
+		end < (rtas_rmo_buf + RTAS_RMOBUF_MAX);
+}
+
+static bool block_rtas_call(int token, int nargs,
+			    struct rtas_args *args)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
+		struct rtas_filter *f = &rtas_filters[i];
+		u32 base, size, end;
+
+		if (token != f->token)
+			continue;
+
+		if (f->buf_idx1 != -1) {
+			base = be32_to_cpu(args->args[f->buf_idx1]);
+			if (f->size_idx1 != -1)
+				size = be32_to_cpu(args->args[f->size_idx1]);
+			else if (f->fixed_size)
+				size = f->fixed_size;
+			else
+				size = 1;
+
+			end = base + size - 1;
+			if (!in_rmo_buf(base, end))
+				goto err;
+		}
+
+		if (f->buf_idx2 != -1) {
+			base = be32_to_cpu(args->args[f->buf_idx2]);
+			if (f->size_idx2 != -1)
+				size = be32_to_cpu(args->args[f->size_idx2]);
+			else if (f->fixed_size)
+				size = f->fixed_size;
+			else
+				size = 1;
+			end = base + size - 1;
+
+			/*
+			 * Special case for ibm,configure-connector where the
+			 * address can be 0
+			 */
+			if (!strcmp(f->name, "ibm,configure-connector") &&
+			    base == 0)
+				return false;
+
+			if (!in_rmo_buf(base, end))
+				goto err;
+		}
+
+		return false;
+	}
+
+err:
+	pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
+	pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
+			   token, nargs, current->comm);
+	return true;
+}
+
+#else
+
+static bool block_rtas_call(int token, int nargs,
+			    struct rtas_args *args)
+{
+	return false;
+}
+
+#endif /* CONFIG_PPC_RTAS_FILTER */
+
+/* We assume to be passed big endian arguments */
+SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
+{
+	struct rtas_args args;
+	unsigned long flags;
+	char *buff_copy, *errbuf = NULL;
+	int nargs, nret, token;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!rtas.entry)
+		return -EINVAL;
+
+	if (copy_from_user(&args, uargs, 3 * sizeof(u32)) != 0)
+		return -EFAULT;
+
+	nargs = be32_to_cpu(args.nargs);
+	nret  = be32_to_cpu(args.nret);
+	token = be32_to_cpu(args.token);
+
+	if (nargs >= ARRAY_SIZE(args.args)
+	    || nret > ARRAY_SIZE(args.args)
+	    || nargs + nret > ARRAY_SIZE(args.args))
+		return -EINVAL;
+
+	/* Copy in args. */
+	if (copy_from_user(args.args, uargs->args,
+			   nargs * sizeof(rtas_arg_t)) != 0)
+		return -EFAULT;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	args.rets = &args.args[nargs];
+	memset(args.rets, 0, nret * sizeof(rtas_arg_t));
+
+	if (block_rtas_call(token, nargs, &args))
+		return -EINVAL;
+
+	/* Need to handle ibm,suspend_me call specially */
+	if (token == ibm_suspend_me_token) {
+
+		/*
+		 * rtas_ibm_suspend_me assumes the streamid handle is in cpu
+		 * endian, or at least the hcall within it requires it.
+		 */
+		int rc = 0;
+		u64 handle = ((u64)be32_to_cpu(args.args[0]) << 32)
+		              | be32_to_cpu(args.args[1]);
+		rc = rtas_ibm_suspend_me(handle);
+		if (rc == -EAGAIN)
+			args.rets[0] = cpu_to_be32(RTAS_NOT_SUSPENDABLE);
+		else if (rc == -EIO)
+			args.rets[0] = cpu_to_be32(-1);
+		else if (rc)
+			return rc;
+		goto copy_return;
+	}
+
+	buff_copy = get_errorlog_buffer();
+
+	flags = lock_rtas();
+
+	rtas.args = args;
+	enter_rtas(__pa(&rtas.args));
+	args = rtas.args;
+
+	/* A -1 return code indicates that the last command couldn't
+	   be completed due to a hardware error. */
+	if (be32_to_cpu(args.rets[0]) == -1)
+		errbuf = __fetch_rtas_last_error(buff_copy);
+
+	unlock_rtas(flags);
+
+	if (buff_copy) {
+		if (errbuf)
+			log_error(errbuf, ERR_TYPE_RTAS_LOG, 0);
+		kfree(buff_copy);
+	}
+
+ copy_return:
+	/* Copy out args. */
+	if (copy_to_user(uargs->args + nargs,
+			 args.args + nargs,
+			 nret * sizeof(rtas_arg_t)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Call early during boot, before mem init, to retrieve the RTAS
+ * information from the device-tree and allocate the RMO buffer for userland
+ * accesses.
+ */
+void __init rtas_initialize(void)
+{
+	unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
+	u32 base, size, entry;
+	int no_base, no_size, no_entry;
+#ifdef CONFIG_PPC_RTAS_FILTER
+	int i;
+#endif
+
+	/* Get RTAS dev node and fill up our "rtas" structure with infos
+	 * about it.
+	 */
+	rtas.dev = of_find_node_by_name(NULL, "rtas");
+	if (!rtas.dev)
+		return;
+
+	no_base = of_property_read_u32(rtas.dev, "linux,rtas-base", &base);
+	no_size = of_property_read_u32(rtas.dev, "rtas-size", &size);
+	if (no_base || no_size) {
+		of_node_put(rtas.dev);
+		rtas.dev = NULL;
+		return;
+	}
+
+	rtas.base = base;
+	rtas.size = size;
+	no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry);
+	rtas.entry = no_entry ? rtas.base : entry;
+
+	/* If RTAS was found, allocate the RMO buffer for it and look for
+	 * the stop-self token if any
+	 */
+#ifdef CONFIG_PPC64
+	if (firmware_has_feature(FW_FEATURE_LPAR)) {
+		rtas_region = min(ppc64_rma_size, RTAS_INSTANTIATE_MAX);
+		ibm_suspend_me_token = rtas_token("ibm,suspend-me");
+	}
+#endif
+	rtas_rmo_buf = memblock_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE, rtas_region);
+
+#ifdef CONFIG_RTAS_ERROR_LOGGING
+	rtas_last_error_token = rtas_token("rtas-last-error");
+#endif
+
+#ifdef CONFIG_PPC_RTAS_FILTER
+	for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
+		rtas_filters[i].token = rtas_token(rtas_filters[i].name);
+	}
+#endif
+}
+
+int __init early_init_dt_scan_rtas(unsigned long node,
+		const char *uname, int depth, void *data)
+{
+	const u32 *basep, *entryp, *sizep;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	basep  = of_get_flat_dt_prop(node, "linux,rtas-base", NULL);
+	entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
+	sizep  = of_get_flat_dt_prop(node, "rtas-size", NULL);
+
+#ifdef CONFIG_PPC64
+	/* need this feature to decide the crashkernel offset */
+	if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
+		powerpc_firmware_features |= FW_FEATURE_LPAR;
+#endif
+
+	if (basep && entryp && sizep) {
+		rtas.base = *basep;
+		rtas.entry = *entryp;
+		rtas.size = *sizep;
+	}
+
+#ifdef CONFIG_UDBG_RTAS_CONSOLE
+	basep = of_get_flat_dt_prop(node, "put-term-char", NULL);
+	if (basep)
+		rtas_putchar_token = *basep;
+
+	basep = of_get_flat_dt_prop(node, "get-term-char", NULL);
+	if (basep)
+		rtas_getchar_token = *basep;
+
+	if (rtas_putchar_token != RTAS_UNKNOWN_SERVICE &&
+	    rtas_getchar_token != RTAS_UNKNOWN_SERVICE)
+		udbg_init_rtas_console();
+
+#endif
+
+	/* break now */
+	return 1;
+}
+
+static arch_spinlock_t timebase_lock;
+static u64 timebase = 0;
+
+void rtas_give_timebase(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	hard_irq_disable();
+	arch_spin_lock(&timebase_lock);
+	rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL);
+	timebase = get_tb();
+	arch_spin_unlock(&timebase_lock);
+
+	while (timebase)
+		barrier();
+	rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
+	local_irq_restore(flags);
+}
+
+void rtas_take_timebase(void)
+{
+	while (!timebase)
+		barrier();
+	arch_spin_lock(&timebase_lock);
+	set_tb(timebase >> 32, timebase & 0xffffffff);
+	timebase = 0;
+	arch_spin_unlock(&timebase_lock);
+}
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
new file mode 100644
index 000000000..10fabae25
--- /dev/null
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -0,0 +1,781 @@
+/*
+ *  c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ * /proc/powerpc/rtas/firmware_flash interface
+ *
+ * This file implements a firmware_flash interface to pump a firmware
+ * image into the kernel.  At reboot time rtas_restart() will see the
+ * firmware image and flash it as it reboots (see rtas.c).
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/reboot.h>
+#include <asm/delay.h>
+#include <linux/uaccess.h>
+#include <asm/rtas.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "rtas_flash"
+
+#define FIRMWARE_FLASH_NAME "firmware_flash"   
+#define FIRMWARE_UPDATE_NAME "firmware_update"
+#define MANAGE_FLASH_NAME "manage_flash"
+#define VALIDATE_FLASH_NAME "validate_flash"
+
+/* General RTAS Status Codes */
+#define RTAS_RC_SUCCESS  0
+#define RTAS_RC_HW_ERR	-1
+#define RTAS_RC_BUSY	-2
+
+/* Flash image status values */
+#define FLASH_AUTH           -9002 /* RTAS Not Service Authority Partition */
+#define FLASH_NO_OP          -1099 /* No operation initiated by user */	
+#define FLASH_IMG_SHORT	     -1005 /* Flash image shorter than expected */
+#define FLASH_IMG_BAD_LEN    -1004 /* Bad length value in flash list block */
+#define FLASH_IMG_NULL_DATA  -1003 /* Bad data value in flash list block */
+#define FLASH_IMG_READY      0     /* Firmware img ready for flash on reboot */
+
+/* Manage image status values */
+#define MANAGE_AUTH          -9002 /* RTAS Not Service Authority Partition */
+#define MANAGE_ACTIVE_ERR    -9001 /* RTAS Cannot Overwrite Active Img */
+#define MANAGE_NO_OP         -1099 /* No operation initiated by user */
+#define MANAGE_PARAM_ERR     -3    /* RTAS Parameter Error */
+#define MANAGE_HW_ERR        -1    /* RTAS Hardware Error */
+
+/* Validate image status values */
+#define VALIDATE_AUTH          -9002 /* RTAS Not Service Authority Partition */
+#define VALIDATE_NO_OP         -1099 /* No operation initiated by the user */
+#define VALIDATE_INCOMPLETE    -1002 /* User copied < VALIDATE_BUF_SIZE */
+#define VALIDATE_READY	       -1001 /* Firmware image ready for validation */
+#define VALIDATE_PARAM_ERR     -3    /* RTAS Parameter Error */
+#define VALIDATE_HW_ERR        -1    /* RTAS Hardware Error */
+
+/* ibm,validate-flash-image update result tokens */
+#define VALIDATE_TMP_UPDATE    0     /* T side will be updated */
+#define VALIDATE_FLASH_AUTH    1     /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG   2     /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN   3     /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replace with new
+ * image, and the new image is downlevel from current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT    5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY  7
+
+/* ibm,manage-flash-image operation tokens */
+#define RTAS_REJECT_TMP_IMG   0
+#define RTAS_COMMIT_TMP_IMG   1
+
+/* Array sizes */
+#define VALIDATE_BUF_SIZE 4096    
+#define VALIDATE_MSG_LEN  256
+#define RTAS_MSG_MAXLEN   64
+
+/* Quirk - RTAS requires 4k list length and block size */
+#define RTAS_BLKLIST_LENGTH 4096
+#define RTAS_BLK_SIZE 4096
+
+struct flash_block {
+	char *data;
+	unsigned long length;
+};
+
+/* This struct is very similar but not identical to
+ * that needed by the rtas flash update.
+ * All we need to do for rtas is rewrite num_blocks
+ * into a version/length and translate the pointers
+ * to absolute.
+ */
+#define FLASH_BLOCKS_PER_NODE ((RTAS_BLKLIST_LENGTH - 16) / sizeof(struct flash_block))
+struct flash_block_list {
+	unsigned long num_blocks;
+	struct flash_block_list *next;
+	struct flash_block blocks[FLASH_BLOCKS_PER_NODE];
+};
+
+static struct flash_block_list *rtas_firmware_flash_list;
+
+/* Use slab cache to guarantee 4k alignment */
+static struct kmem_cache *flash_block_cache = NULL;
+
+#define FLASH_BLOCK_LIST_VERSION (1UL)
+
+/*
+ * Local copy of the flash block list.
+ *
+ * The rtas_firmware_flash_list varable will be
+ * set once the data is fully read.
+ *
+ * For convenience as we build the list we use virtual addrs,
+ * we do not fill in the version number, and the length field
+ * is treated as the number of entries currently in the block
+ * (i.e. not a byte count).  This is all fixed when calling 
+ * the flash routine.
+ */
+
+/* Status int must be first member of struct */
+struct rtas_update_flash_t
+{
+	int status;			/* Flash update status */
+	struct flash_block_list *flist; /* Local copy of flash block list */
+};
+
+/* Status int must be first member of struct */
+struct rtas_manage_flash_t
+{
+	int status;			/* Returned status */
+};
+
+/* Status int must be first member of struct */
+struct rtas_validate_flash_t
+{
+	int status;		 	/* Returned status */	
+	char *buf;			/* Candidate image buffer */
+	unsigned int buf_size;		/* Size of image buf */
+	unsigned int update_results;	/* Update results token */
+};
+
+static struct rtas_update_flash_t rtas_update_flash_data;
+static struct rtas_manage_flash_t rtas_manage_flash_data;
+static struct rtas_validate_flash_t rtas_validate_flash_data;
+static DEFINE_MUTEX(rtas_update_flash_mutex);
+static DEFINE_MUTEX(rtas_manage_flash_mutex);
+static DEFINE_MUTEX(rtas_validate_flash_mutex);
+
+/* Do simple sanity checks on the flash image. */
+static int flash_list_valid(struct flash_block_list *flist)
+{
+	struct flash_block_list *f;
+	int i;
+	unsigned long block_size, image_size;
+
+	/* Paranoid self test here.  We also collect the image size. */
+	image_size = 0;
+	for (f = flist; f; f = f->next) {
+		for (i = 0; i < f->num_blocks; i++) {
+			if (f->blocks[i].data == NULL) {
+				return FLASH_IMG_NULL_DATA;
+			}
+			block_size = f->blocks[i].length;
+			if (block_size <= 0 || block_size > RTAS_BLK_SIZE) {
+				return FLASH_IMG_BAD_LEN;
+			}
+			image_size += block_size;
+		}
+	}
+
+	if (image_size < (256 << 10)) {
+		if (image_size < 2) 
+			return FLASH_NO_OP;
+	}
+
+	printk(KERN_INFO "FLASH: flash image with %ld bytes stored for hardware flash on reboot\n", image_size);
+
+	return FLASH_IMG_READY;
+}
+
+static void free_flash_list(struct flash_block_list *f)
+{
+	struct flash_block_list *next;
+	int i;
+
+	while (f) {
+		for (i = 0; i < f->num_blocks; i++)
+			kmem_cache_free(flash_block_cache, f->blocks[i].data);
+		next = f->next;
+		kmem_cache_free(flash_block_cache, f);
+		f = next;
+	}
+}
+
+static int rtas_flash_release(struct inode *inode, struct file *file)
+{
+	struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+
+	mutex_lock(&rtas_update_flash_mutex);
+
+	if (uf->flist) {    
+		/* File was opened in write mode for a new flash attempt */
+		/* Clear saved list */
+		if (rtas_firmware_flash_list) {
+			free_flash_list(rtas_firmware_flash_list);
+			rtas_firmware_flash_list = NULL;
+		}
+
+		if (uf->status != FLASH_AUTH)  
+			uf->status = flash_list_valid(uf->flist);
+
+		if (uf->status == FLASH_IMG_READY) 
+			rtas_firmware_flash_list = uf->flist;
+		else
+			free_flash_list(uf->flist);
+
+		uf->flist = NULL;
+	}
+
+	mutex_unlock(&rtas_update_flash_mutex);
+	return 0;
+}
+
+static size_t get_flash_status_msg(int status, char *buf)
+{
+	const char *msg;
+	size_t len;
+
+	switch (status) {
+	case FLASH_AUTH:
+		msg = "error: this partition does not have service authority\n";
+		break;
+	case FLASH_NO_OP:
+		msg = "info: no firmware image for flash\n";
+		break;
+	case FLASH_IMG_SHORT:
+		msg = "error: flash image short\n";
+		break;
+	case FLASH_IMG_BAD_LEN:
+		msg = "error: internal error bad length\n";
+		break;
+	case FLASH_IMG_NULL_DATA:
+		msg = "error: internal error null data\n";
+		break;
+	case FLASH_IMG_READY:
+		msg = "ready: firmware image ready for flash on reboot\n";
+		break;
+	default:
+		return sprintf(buf, "error: unexpected status value %d\n",
+			       status);
+	}
+
+	len = strlen(msg);
+	memcpy(buf, msg, len + 1);
+	return len;
+}
+
+/* Reading the proc file will show status (not the firmware contents) */
+static ssize_t rtas_flash_read_msg(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+	char msg[RTAS_MSG_MAXLEN];
+	size_t len;
+	int status;
+
+	mutex_lock(&rtas_update_flash_mutex);
+	status = uf->status;
+	mutex_unlock(&rtas_update_flash_mutex);
+
+	/* Read as text message */
+	len = get_flash_status_msg(status, msg);
+	return simple_read_from_buffer(buf, count, ppos, msg, len);
+}
+
+static ssize_t rtas_flash_read_num(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+	char msg[RTAS_MSG_MAXLEN];
+	int status;
+
+	mutex_lock(&rtas_update_flash_mutex);
+	status = uf->status;
+	mutex_unlock(&rtas_update_flash_mutex);
+
+	/* Read as number */
+	sprintf(msg, "%d\n", status);
+	return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg));
+}
+
+/* We could be much more efficient here.  But to keep this function
+ * simple we allocate a page to the block list no matter how small the
+ * count is.  If the system is low on memory it will be just as well
+ * that we fail....
+ */
+static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
+				size_t count, loff_t *off)
+{
+	struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+	char *p;
+	int next_free, rc;
+	struct flash_block_list *fl;
+
+	mutex_lock(&rtas_update_flash_mutex);
+
+	if (uf->status == FLASH_AUTH || count == 0)
+		goto out;	/* discard data */
+
+	/* In the case that the image is not ready for flashing, the memory
+	 * allocated for the block list will be freed upon the release of the 
+	 * proc file
+	 */
+	if (uf->flist == NULL) {
+		uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
+		if (!uf->flist)
+			goto nomem;
+	}
+
+	fl = uf->flist;
+	while (fl->next)
+		fl = fl->next; /* seek to last block_list for append */
+	next_free = fl->num_blocks;
+	if (next_free == FLASH_BLOCKS_PER_NODE) {
+		/* Need to allocate another block_list */
+		fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
+		if (!fl->next)
+			goto nomem;
+		fl = fl->next;
+		next_free = 0;
+	}
+
+	if (count > RTAS_BLK_SIZE)
+		count = RTAS_BLK_SIZE;
+	p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
+	if (!p)
+		goto nomem;
+	
+	if(copy_from_user(p, buffer, count)) {
+		kmem_cache_free(flash_block_cache, p);
+		rc = -EFAULT;
+		goto error;
+	}
+	fl->blocks[next_free].data = p;
+	fl->blocks[next_free].length = count;
+	fl->num_blocks++;
+out:
+	mutex_unlock(&rtas_update_flash_mutex);
+	return count;
+
+nomem:
+	rc = -ENOMEM;
+error:
+	mutex_unlock(&rtas_update_flash_mutex);
+	return rc;
+}
+
+/*
+ * Flash management routines.
+ */
+static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op)
+{
+	s32 rc;
+
+	do {
+		rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1,
+			       NULL, op);
+	} while (rtas_busy_delay(rc));
+
+	args_buf->status = rc;
+}
+
+static ssize_t manage_flash_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data;
+	char msg[RTAS_MSG_MAXLEN];
+	int msglen, status;
+
+	mutex_lock(&rtas_manage_flash_mutex);
+	status = args_buf->status;
+	mutex_unlock(&rtas_manage_flash_mutex);
+
+	msglen = sprintf(msg, "%d\n", status);
+	return simple_read_from_buffer(buf, count, ppos, msg, msglen);
+}
+
+static ssize_t manage_flash_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *off)
+{
+	struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data;
+	static const char reject_str[] = "0";
+	static const char commit_str[] = "1";
+	char stkbuf[10];
+	int op, rc;
+
+	mutex_lock(&rtas_manage_flash_mutex);
+
+	if ((args_buf->status == MANAGE_AUTH) || (count == 0))
+		goto out;
+		
+	op = -1;
+	if (buf) {
+		if (count > 9) count = 9;
+		rc = -EFAULT;
+		if (copy_from_user (stkbuf, buf, count))
+			goto error;
+		if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0) 
+			op = RTAS_REJECT_TMP_IMG;
+		else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0) 
+			op = RTAS_COMMIT_TMP_IMG;
+	}
+	
+	if (op == -1) {   /* buf is empty, or contains invalid string */
+		rc = -EINVAL;
+		goto error;
+	}
+
+	manage_flash(args_buf, op);
+out:
+	mutex_unlock(&rtas_manage_flash_mutex);
+	return count;
+
+error:
+	mutex_unlock(&rtas_manage_flash_mutex);
+	return rc;
+}
+
+/*
+ * Validation routines.
+ */
+static void validate_flash(struct rtas_validate_flash_t *args_buf)
+{
+	int token = rtas_token("ibm,validate-flash-image");
+	int update_results;
+	s32 rc;	
+
+	rc = 0;
+	do {
+		spin_lock(&rtas_data_buf_lock);
+		memcpy(rtas_data_buf, args_buf->buf, VALIDATE_BUF_SIZE);
+		rc = rtas_call(token, 2, 2, &update_results, 
+			       (u32) __pa(rtas_data_buf), args_buf->buf_size);
+		memcpy(args_buf->buf, rtas_data_buf, VALIDATE_BUF_SIZE);
+		spin_unlock(&rtas_data_buf_lock);
+	} while (rtas_busy_delay(rc));
+
+	args_buf->status = rc;
+	args_buf->update_results = update_results;
+}
+
+static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, 
+		                   char *msg, int msglen)
+{
+	int n;
+
+	if (args_buf->status >= VALIDATE_TMP_UPDATE) { 
+		n = sprintf(msg, "%d\n", args_buf->update_results);
+		if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) ||
+		    (args_buf->update_results == VALIDATE_TMP_UPDATE))
+			n += snprintf(msg + n, msglen - n, "%s\n",
+					args_buf->buf);
+	} else {
+		n = sprintf(msg, "%d\n", args_buf->status);
+	}
+	return n;
+}
+
+static ssize_t validate_flash_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct rtas_validate_flash_t *const args_buf =
+		&rtas_validate_flash_data;
+	char msg[VALIDATE_MSG_LEN];
+	int msglen;
+
+	mutex_lock(&rtas_validate_flash_mutex);
+	msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN);
+	mutex_unlock(&rtas_validate_flash_mutex);
+
+	return simple_read_from_buffer(buf, count, ppos, msg, msglen);
+}
+
+static ssize_t validate_flash_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *off)
+{
+	struct rtas_validate_flash_t *const args_buf =
+		&rtas_validate_flash_data;
+	int rc;
+
+	mutex_lock(&rtas_validate_flash_mutex);
+
+	/* We are only interested in the first 4K of the
+	 * candidate image */
+	if ((*off >= VALIDATE_BUF_SIZE) || 
+		(args_buf->status == VALIDATE_AUTH)) {
+		*off += count;
+		mutex_unlock(&rtas_validate_flash_mutex);
+		return count;
+	}
+
+	if (*off + count >= VALIDATE_BUF_SIZE)  {
+		count = VALIDATE_BUF_SIZE - *off;
+		args_buf->status = VALIDATE_READY;	
+	} else {
+		args_buf->status = VALIDATE_INCOMPLETE;
+	}
+
+	if (!access_ok(VERIFY_READ, buf, count)) {
+		rc = -EFAULT;
+		goto done;
+	}
+	if (copy_from_user(args_buf->buf + *off, buf, count)) {
+		rc = -EFAULT;
+		goto done;
+	}
+
+	*off += count;
+	rc = count;
+done:
+	mutex_unlock(&rtas_validate_flash_mutex);
+	return rc;
+}
+
+static int validate_flash_release(struct inode *inode, struct file *file)
+{
+	struct rtas_validate_flash_t *const args_buf =
+		&rtas_validate_flash_data;
+
+	mutex_lock(&rtas_validate_flash_mutex);
+
+	if (args_buf->status == VALIDATE_READY) {
+		args_buf->buf_size = VALIDATE_BUF_SIZE;
+		validate_flash(args_buf);
+	}
+
+	mutex_unlock(&rtas_validate_flash_mutex);
+	return 0;
+}
+
+/*
+ * On-reboot flash update applicator.
+ */
+static void rtas_flash_firmware(int reboot_type)
+{
+	unsigned long image_size;
+	struct flash_block_list *f, *next, *flist;
+	unsigned long rtas_block_list;
+	int i, status, update_token;
+
+	if (rtas_firmware_flash_list == NULL)
+		return;		/* nothing to do */
+
+	if (reboot_type != SYS_RESTART) {
+		printk(KERN_ALERT "FLASH: firmware flash requires a reboot\n");
+		printk(KERN_ALERT "FLASH: the firmware image will NOT be flashed\n");
+		return;
+	}
+
+	update_token = rtas_token("ibm,update-flash-64-and-reboot");
+	if (update_token == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot "
+		       "is not available -- not a service partition?\n");
+		printk(KERN_ALERT "FLASH: firmware will not be flashed\n");
+		return;
+	}
+
+	/*
+	 * Just before starting the firmware flash, cancel the event scan work
+	 * to avoid any soft lockup issues.
+	 */
+	rtas_cancel_event_scan();
+
+	/*
+	 * NOTE: the "first" block must be under 4GB, so we create
+	 * an entry with no data blocks in the reserved buffer in
+	 * the kernel data segment.
+	 */
+	spin_lock(&rtas_data_buf_lock);
+	flist = (struct flash_block_list *)&rtas_data_buf[0];
+	flist->num_blocks = 0;
+	flist->next = rtas_firmware_flash_list;
+	rtas_block_list = __pa(flist);
+	if (rtas_block_list >= 4UL*1024*1024*1024) {
+		printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n");
+		spin_unlock(&rtas_data_buf_lock);
+		return;
+	}
+
+	printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n");
+	/* Update the block_list in place. */
+	rtas_firmware_flash_list = NULL; /* too hard to backout on error */
+	image_size = 0;
+	for (f = flist; f; f = next) {
+		/* Translate data addrs to absolute */
+		for (i = 0; i < f->num_blocks; i++) {
+			f->blocks[i].data = (char *)cpu_to_be64(__pa(f->blocks[i].data));
+			image_size += f->blocks[i].length;
+			f->blocks[i].length = cpu_to_be64(f->blocks[i].length);
+		}
+		next = f->next;
+		/* Don't translate NULL pointer for last entry */
+		if (f->next)
+			f->next = (struct flash_block_list *)cpu_to_be64(__pa(f->next));
+		else
+			f->next = NULL;
+		/* make num_blocks into the version/length field */
+		f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16);
+		f->num_blocks = cpu_to_be64(f->num_blocks);
+	}
+
+	printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size);
+	printk(KERN_ALERT "FLASH: performing flash and reboot\n");
+	rtas_progress("Flashing        \n", 0x0);
+	rtas_progress("Please Wait...  ", 0x0);
+	printk(KERN_ALERT "FLASH: this will take several minutes.  Do not power off!\n");
+	status = rtas_call(update_token, 1, 1, NULL, rtas_block_list);
+	switch (status) {	/* should only get "bad" status */
+	    case 0:
+		printk(KERN_ALERT "FLASH: success\n");
+		break;
+	    case -1:
+		printk(KERN_ALERT "FLASH: hardware error.  Firmware may not be not flashed\n");
+		break;
+	    case -3:
+		printk(KERN_ALERT "FLASH: image is corrupt or not correct for this platform.  Firmware not flashed\n");
+		break;
+	    case -4:
+		printk(KERN_ALERT "FLASH: flash failed when partially complete.  System may not reboot\n");
+		break;
+	    default:
+		printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status);
+		break;
+	}
+	spin_unlock(&rtas_data_buf_lock);
+}
+
+/*
+ * Manifest of proc files to create
+ */
+struct rtas_flash_file {
+	const char *filename;
+	const char *rtas_call_name;
+	int *status;
+	const struct file_operations fops;
+};
+
+static const struct rtas_flash_file rtas_flash_files[] = {
+	{
+		.filename	= "powerpc/rtas/" FIRMWARE_FLASH_NAME,
+		.rtas_call_name	= "ibm,update-flash-64-and-reboot",
+		.status		= &rtas_update_flash_data.status,
+		.fops.read	= rtas_flash_read_msg,
+		.fops.write	= rtas_flash_write,
+		.fops.release	= rtas_flash_release,
+		.fops.llseek	= default_llseek,
+	},
+	{
+		.filename	= "powerpc/rtas/" FIRMWARE_UPDATE_NAME,
+		.rtas_call_name	= "ibm,update-flash-64-and-reboot",
+		.status		= &rtas_update_flash_data.status,
+		.fops.read	= rtas_flash_read_num,
+		.fops.write	= rtas_flash_write,
+		.fops.release	= rtas_flash_release,
+		.fops.llseek	= default_llseek,
+	},
+	{
+		.filename	= "powerpc/rtas/" VALIDATE_FLASH_NAME,
+		.rtas_call_name	= "ibm,validate-flash-image",
+		.status		= &rtas_validate_flash_data.status,
+		.fops.read	= validate_flash_read,
+		.fops.write	= validate_flash_write,
+		.fops.release	= validate_flash_release,
+		.fops.llseek	= default_llseek,
+	},
+	{
+		.filename	= "powerpc/rtas/" MANAGE_FLASH_NAME,
+		.rtas_call_name	= "ibm,manage-flash-image",
+		.status		= &rtas_manage_flash_data.status,
+		.fops.read	= manage_flash_read,
+		.fops.write	= manage_flash_write,
+		.fops.llseek	= default_llseek,
+	}
+};
+
+static int __init rtas_flash_init(void)
+{
+	int i;
+
+	if (rtas_token("ibm,update-flash-64-and-reboot") ==
+		       RTAS_UNKNOWN_SERVICE) {
+		pr_info("rtas_flash: no firmware flash support\n");
+		return -EINVAL;
+	}
+
+	rtas_validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+	if (!rtas_validate_flash_data.buf)
+		return -ENOMEM;
+
+	flash_block_cache = kmem_cache_create("rtas_flash_cache",
+					      RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0,
+					      NULL);
+	if (!flash_block_cache) {
+		printk(KERN_ERR "%s: failed to create block cache\n",
+				__func__);
+		goto enomem_buf;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) {
+		const struct rtas_flash_file *f = &rtas_flash_files[i];
+		int token;
+
+		if (!proc_create(f->filename, 0600, NULL, &f->fops))
+			goto enomem;
+
+		/*
+		 * This code assumes that the status int is the first member of the
+		 * struct
+		 */
+		token = rtas_token(f->rtas_call_name);
+		if (token == RTAS_UNKNOWN_SERVICE)
+			*f->status = FLASH_AUTH;
+		else
+			*f->status = FLASH_NO_OP;
+	}
+
+	rtas_flash_term_hook = rtas_flash_firmware;
+	return 0;
+
+enomem:
+	while (--i >= 0) {
+		const struct rtas_flash_file *f = &rtas_flash_files[i];
+		remove_proc_entry(f->filename, NULL);
+	}
+
+	kmem_cache_destroy(flash_block_cache);
+enomem_buf:
+	kfree(rtas_validate_flash_data.buf);
+	return -ENOMEM;
+}
+
+static void __exit rtas_flash_cleanup(void)
+{
+	int i;
+
+	rtas_flash_term_hook = NULL;
+
+	if (rtas_firmware_flash_list) {
+		free_flash_list(rtas_firmware_flash_list);
+		rtas_firmware_flash_list = NULL;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) {
+		const struct rtas_flash_file *f = &rtas_flash_files[i];
+		remove_proc_entry(f->filename, NULL);
+	}
+
+	kmem_cache_destroy(flash_block_cache);
+	kfree(rtas_validate_flash_data.buf);
+}
+
+module_init(rtas_flash_init);
+module_exit(rtas_flash_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
new file mode 100644
index 000000000..c2b148b16
--- /dev/null
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * RTAS specific routines for PCI.
+ *
+ * Based on code from pci.c, chrp_pci.c and pSeries_pci.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/rtas.h>
+#include <asm/mpic.h>
+#include <asm/ppc-pci.h>
+#include <asm/eeh.h>
+
+/* RTAS tokens */
+static int read_pci_config;
+static int write_pci_config;
+static int ibm_read_pci_config;
+static int ibm_write_pci_config;
+
+static inline int config_access_valid(struct pci_dn *dn, int where)
+{
+	if (where < 256)
+		return 1;
+	if (where < 4096 && dn->pci_ext_config_space)
+		return 1;
+
+	return 0;
+}
+
+int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+{
+	int returnval = -1;
+	unsigned long buid, addr;
+	int ret;
+
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!config_access_valid(pdn, where))
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+#ifdef CONFIG_EEH
+	if (pdn->edev && pdn->edev->pe &&
+	    (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED))
+		return PCIBIOS_SET_FAILED;
+#endif
+
+	addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
+	buid = pdn->phb->buid;
+	if (buid) {
+		ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
+				addr, BUID_HI(buid), BUID_LO(buid), size);
+	} else {
+		ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
+	}
+	*val = returnval;
+
+	if (ret)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int rtas_pci_read_config(struct pci_bus *bus,
+				unsigned int devfn,
+				int where, int size, u32 *val)
+{
+	struct pci_dn *pdn;
+	int ret;
+
+	*val = 0xFFFFFFFF;
+
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+
+	/* Validity of pdn is checked in here */
+	ret = rtas_read_config(pdn, where, size, val);
+	if (*val == EEH_IO_ERROR_VALUE(size) &&
+	    eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return ret;
+}
+
+int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
+{
+	unsigned long buid, addr;
+	int ret;
+
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (!config_access_valid(pdn, where))
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+#ifdef CONFIG_EEH
+	if (pdn->edev && pdn->edev->pe &&
+	    (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED))
+		return PCIBIOS_SET_FAILED;
+#endif
+
+	addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
+	buid = pdn->phb->buid;
+	if (buid) {
+		ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
+			BUID_HI(buid), BUID_LO(buid), size, (ulong) val);
+	} else {
+		ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val);
+	}
+
+	if (ret)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int rtas_pci_write_config(struct pci_bus *bus,
+				 unsigned int devfn,
+				 int where, int size, u32 val)
+{
+	struct pci_dn *pdn;
+
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+
+	/* Validity of pdn is checked in here. */
+	return rtas_write_config(pdn, where, size, val);
+}
+
+static struct pci_ops rtas_pci_ops = {
+	.read = rtas_pci_read_config,
+	.write = rtas_pci_write_config,
+};
+
+static int is_python(struct device_node *dev)
+{
+	const char *model = of_get_property(dev, "model", NULL);
+
+	if (model && strstr(model, "Python"))
+		return 1;
+
+	return 0;
+}
+
+static void python_countermeasures(struct device_node *dev)
+{
+	struct resource registers;
+	void __iomem *chip_regs;
+	volatile u32 val;
+
+	if (of_address_to_resource(dev, 0, &registers)) {
+		printk(KERN_ERR "Can't get address for Python workarounds !\n");
+		return;
+	}
+
+	/* Python's register file is 1 MB in size. */
+	chip_regs = ioremap(registers.start & ~(0xfffffUL), 0x100000);
+
+	/*
+	 * Firmware doesn't always clear this bit which is critical
+	 * for good performance - Anton
+	 */
+
+#define PRG_CL_RESET_VALID 0x00010000
+
+	val = in_be32(chip_regs + 0xf6030);
+	if (val & PRG_CL_RESET_VALID) {
+		printk(KERN_INFO "Python workaround: ");
+		val &= ~PRG_CL_RESET_VALID;
+		out_be32(chip_regs + 0xf6030, val);
+		/*
+		 * We must read it back for changes to
+		 * take effect
+		 */
+		val = in_be32(chip_regs + 0xf6030);
+		printk("reg0: %x\n", val);
+	}
+
+	iounmap(chip_regs);
+}
+
+void __init init_pci_config_tokens(void)
+{
+	read_pci_config = rtas_token("read-pci-config");
+	write_pci_config = rtas_token("write-pci-config");
+	ibm_read_pci_config = rtas_token("ibm,read-pci-config");
+	ibm_write_pci_config = rtas_token("ibm,write-pci-config");
+}
+
+unsigned long get_phb_buid(struct device_node *phb)
+{
+	struct resource r;
+
+	if (ibm_read_pci_config == -1)
+		return 0;
+	if (of_address_to_resource(phb, 0, &r))
+		return 0;
+	return r.start;
+}
+
+static int phb_set_bus_ranges(struct device_node *dev,
+			      struct pci_controller *phb)
+{
+	const __be32 *bus_range;
+	unsigned int len;
+
+	bus_range = of_get_property(dev, "bus-range", &len);
+	if (bus_range == NULL || len < 2 * sizeof(int)) {
+		return 1;
+ 	}
+
+	phb->first_busno = be32_to_cpu(bus_range[0]);
+	phb->last_busno  = be32_to_cpu(bus_range[1]);
+
+	return 0;
+}
+
+int rtas_setup_phb(struct pci_controller *phb)
+{
+	struct device_node *dev = phb->dn;
+
+	if (is_python(dev))
+		python_countermeasures(dev);
+
+	if (phb_set_bus_ranges(dev, phb))
+		return 1;
+
+	phb->ops = &rtas_pci_ops;
+	phb->buid = get_phb_buid(dev);
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
new file mode 100644
index 000000000..23b88b923
--- /dev/null
+++ b/arch/powerpc/kernel/rtasd.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <linux/atomic.h>
+#include <asm/machdep.h>
+#include <asm/topology.h>
+
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+/* RTAS service tokens */
+static unsigned int event_scan;
+static unsigned int rtas_event_scan_rate;
+
+static bool full_rtas_msgs;
+
+/* Stop logging to nvram after first fatal error */
+static int logging_enabled; /* Until we initialize everything,
+                             * make sure we don't try logging
+                             * anything */
+static int error_log_cnt;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static char *rtas_type[] = {
+	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
+	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
+	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+static char *rtas_event_type(int type)
+{
+	if ((type > 0) && (type < 11))
+		return rtas_type[type];
+
+	switch (type) {
+		case RTAS_TYPE_EPOW:
+			return "EPOW";
+		case RTAS_TYPE_PLATFORM:
+			return "Platform Error";
+		case RTAS_TYPE_IO:
+			return "I/O Event";
+		case RTAS_TYPE_INFO:
+			return "Platform Information Event";
+		case RTAS_TYPE_DEALLOC:
+			return "Resource Deallocation Event";
+		case RTAS_TYPE_DUMP:
+			return "Dump Notification Event";
+		case RTAS_TYPE_PRRN:
+			return "Platform Resource Reassignment Event";
+	}
+
+	return rtas_type[0];
+}
+
+/* To see this info, grep RTAS /var/log/messages and each entry
+ * will be collected together with obvious begin/end.
+ * There will be a unique identifier on the begin and end lines.
+ * This will persist across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes	(size)	: contents
+ * --------------------------------------------------------
+ * 0-7		(8)	: rtas_error_log
+ * 8-47		(40)	: extended info
+ * 48-51	(4)	: vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+
+	int i,j,n = 0;
+	int perline = 16;
+	char buffer[64];
+	char * str = "RTAS event";
+
+	if (full_rtas_msgs) {
+		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+		       error_log_cnt, str);
+
+		/*
+		 * Print perline bytes on each line, each line will start
+		 * with RTAS and a changing number, so syslogd will
+		 * print lines that are otherwise the same.  Separate every
+		 * 4 bytes with a space.
+		 */
+		for (i = 0; i < len; i++) {
+			j = i % perline;
+			if (j == 0) {
+				memset(buffer, 0, sizeof(buffer));
+				n = sprintf(buffer, "RTAS %d:", i/perline);
+			}
+
+			if ((i % 4) == 0)
+				n += sprintf(buffer+n, " ");
+
+			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+			if (j == (perline-1))
+				printk(KERN_DEBUG "%s\n", buffer);
+		}
+		if ((i % perline) != 0)
+			printk(KERN_DEBUG "%s\n", buffer);
+
+		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+		       error_log_cnt, str);
+	} else {
+		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+		       error_log_cnt, rtas_event_type(rtas_error_type(errlog)),
+		       rtas_error_severity(errlog));
+	}
+}
+
+static int log_rtas_len(char * buf)
+{
+	int len;
+	struct rtas_error_log *err;
+	uint32_t extended_log_length;
+
+	/* rtas fixed header */
+	len = 8;
+	err = (struct rtas_error_log *)buf;
+	extended_log_length = rtas_error_extended_log_length(err);
+	if (rtas_error_extended(err) && extended_log_length) {
+
+		/* extended header */
+		len += extended_log_length;
+	}
+
+	if (rtas_error_log_max == 0)
+		rtas_error_log_max = rtas_get_error_log_max();
+
+	if (len > rtas_error_log_max)
+		len = rtas_error_log_max;
+
+	return len;
+}
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info.  The error will be picked up
+ * on the next reboot by rtasd.  If not fatal, run the
+ * method for the type of error.  Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+	unsigned long offset;
+	unsigned long s;
+	int len = 0;
+
+	pr_debug("rtasd: logging event\n");
+	if (buf == NULL)
+		return;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* get length and increase count */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		len = log_rtas_len(buf);
+		if (!(err_type & ERR_FLAG_BOOT))
+			error_log_cnt++;
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+#ifdef CONFIG_PPC64
+	/* Write error to NVRAM */
+	if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
+		nvram_write_error_log(buf, len, err_type, error_log_cnt);
+#endif /* CONFIG_PPC64 */
+
+	/*
+	 * rtas errors can occur during boot, and we do want to capture
+	 * those somewhere, even if nvram isn't ready (why not?), and even
+	 * if rtasd isn't ready. Put them into the boot log, at least.
+	 */
+	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+		printk_log_rtas(buf, len);
+
+	/* Check to see if we need to or have stopped logging */
+	if (fatal || !logging_enabled) {
+		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* call type specific method for error */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		offset = rtas_error_log_buffer_max *
+			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+		/* First copy over sequence number */
+		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+		/* Second copy over error log data */
+		offset += sizeof(int);
+		memcpy(&rtas_log_buf[offset], buf, len);
+
+		if (rtas_log_size < LOG_NUMBER)
+			rtas_log_size += 1;
+		else
+			rtas_log_start += 1;
+
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		wake_up_interruptible(&rtas_log_wait);
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+}
+
+#ifdef CONFIG_PPC_PSERIES
+static void handle_prrn_event(s32 scope)
+{
+	/*
+	 * For PRRN, we must pass the negative of the scope value in
+	 * the RTAS event.
+	 */
+	pseries_devicetree_update(-scope);
+	numa_update_cpu_topology(false);
+}
+
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+	if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled())
+		return;
+
+	/* For PRRN Events the extended log length is used to denote
+	 * the scope for calling rtas update-nodes.
+	 */
+	handle_prrn_event(rtas_error_extended_log_length(log));
+}
+
+#else
+
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+	return;
+}
+
+#endif
+
+static int rtas_log_open(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+static int rtas_log_release(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+/* This will check if all events are logged, if they are then, we
+ * know that we can safely clear the events in NVRAM.
+ * Next we'll sit and wait for something else to log.
+ */
+static ssize_t rtas_log_read(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	int error;
+	char *tmp;
+	unsigned long s;
+	unsigned long offset;
+
+	if (!buf || count < rtas_error_log_buffer_max)
+		return -EINVAL;
+
+	count = rtas_error_log_buffer_max;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* if it's 0, then we know we got the last one (the one in NVRAM) */
+	while (rtas_log_size == 0) {
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock_irqrestore(&rtasd_log_lock, s);
+			error = -EAGAIN;
+			goto out;
+		}
+
+		if (!logging_enabled) {
+			spin_unlock_irqrestore(&rtasd_log_lock, s);
+			error = -ENODATA;
+			goto out;
+		}
+#ifdef CONFIG_PPC64
+		nvram_clear_error_log();
+#endif /* CONFIG_PPC64 */
+
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+		if (error)
+			goto out;
+		spin_lock_irqsave(&rtasd_log_lock, s);
+	}
+
+	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+	memcpy(tmp, &rtas_log_buf[offset], count);
+
+	rtas_log_start += 1;
+	rtas_log_size -= 1;
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+	kfree(tmp);
+	return error;
+}
+
+static __poll_t rtas_log_poll(struct file *file, poll_table * wait)
+{
+	poll_wait(file, &rtas_log_wait, wait);
+	if (rtas_log_size)
+		return EPOLLIN | EPOLLRDNORM;
+	return 0;
+}
+
+static const struct file_operations proc_rtas_log_operations = {
+	.read =		rtas_log_read,
+	.poll =		rtas_log_poll,
+	.open =		rtas_log_open,
+	.release =	rtas_log_release,
+	.llseek =	noop_llseek,
+};
+
+static int enable_surveillance(int timeout)
+{
+	int error;
+
+	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+	if (error == 0)
+		return 0;
+
+	if (error == -EINVAL) {
+		printk(KERN_DEBUG "rtasd: surveillance not supported\n");
+		return 0;
+	}
+
+	printk(KERN_ERR "rtasd: could not update surveillance\n");
+	return -1;
+}
+
+static void do_event_scan(void)
+{
+	int error;
+	do {
+		memset(logdata, 0, rtas_error_log_max);
+		error = rtas_call(event_scan, 4, 1, NULL,
+				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+				  __pa(logdata), rtas_error_log_max);
+		if (error == -1) {
+			printk(KERN_ERR "event-scan failed\n");
+			break;
+		}
+
+		if (error == 0) {
+			if (rtas_error_type((struct rtas_error_log *)logdata) !=
+			    RTAS_TYPE_PRRN)
+				pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG,
+						  0);
+			handle_rtas_event((struct rtas_error_log *)logdata);
+		}
+
+	} while(error == 0);
+}
+
+static void rtas_event_scan(struct work_struct *w);
+static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+
+/*
+ * Delay should be at least one second since some machines have problems if
+ * we call event-scan too quickly.
+ */
+static unsigned long event_scan_delay = 1*HZ;
+static int first_pass = 1;
+
+static void rtas_event_scan(struct work_struct *w)
+{
+	unsigned int cpu;
+
+	do_event_scan();
+
+	get_online_cpus();
+
+	/* raw_ OK because just using CPU as starting point. */
+	cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+        if (cpu >= nr_cpu_ids) {
+		cpu = cpumask_first(cpu_online_mask);
+
+		if (first_pass) {
+			first_pass = 0;
+			event_scan_delay = 30*HZ/rtas_event_scan_rate;
+
+			if (surveillance_timeout != -1) {
+				pr_debug("rtasd: enabling surveillance\n");
+				enable_surveillance(surveillance_timeout);
+				pr_debug("rtasd: surveillance enabled\n");
+			}
+		}
+	}
+
+	schedule_delayed_work_on(cpu, &event_scan_work,
+		__round_jiffies_relative(event_scan_delay, cpu));
+
+	put_online_cpus();
+}
+
+#ifdef CONFIG_PPC64
+static void retrieve_nvram_error_log(void)
+{
+	unsigned int err_type ;
+	int rc ;
+
+	/* See if we have any error stored in NVRAM */
+	memset(logdata, 0, rtas_error_log_max);
+	rc = nvram_read_error_log(logdata, rtas_error_log_max,
+	                          &err_type, &error_log_cnt);
+	/* We can use rtas_log_buf now */
+	logging_enabled = 1;
+	if (!rc) {
+		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
+			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+		}
+	}
+}
+#else /* CONFIG_PPC64 */
+static void retrieve_nvram_error_log(void)
+{
+}
+#endif /* CONFIG_PPC64 */
+
+static void start_event_scan(void)
+{
+	printk(KERN_DEBUG "RTAS daemon started\n");
+	pr_debug("rtasd: will sleep for %d milliseconds\n",
+		 (30000 / rtas_event_scan_rate));
+
+	/* Retrieve errors from nvram if any */
+	retrieve_nvram_error_log();
+
+	schedule_delayed_work_on(cpumask_first(cpu_online_mask),
+				 &event_scan_work, event_scan_delay);
+}
+
+/* Cancel the rtas event scan work */
+void rtas_cancel_event_scan(void)
+{
+	cancel_delayed_work_sync(&event_scan_work);
+}
+EXPORT_SYMBOL_GPL(rtas_cancel_event_scan);
+
+static int __init rtas_event_scan_init(void)
+{
+	if (!machine_is(pseries) && !machine_is(chrp))
+		return 0;
+
+	/* No RTAS */
+	event_scan = rtas_token("event-scan");
+	if (event_scan == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_INFO "rtasd: No event-scan on system\n");
+		return -ENODEV;
+	}
+
+	rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
+	if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
+		return -ENODEV;
+	}
+
+	if (!rtas_event_scan_rate) {
+		/* Broken firmware: take a rate of zero to mean don't scan */
+		printk(KERN_DEBUG "rtasd: scan rate is 0, not scanning\n");
+		return 0;
+	}
+
+	/* Make room for the sequence number */
+	rtas_error_log_max = rtas_get_error_log_max();
+	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+	rtas_log_buf = vmalloc(array_size(LOG_NUMBER,
+					  rtas_error_log_buffer_max));
+	if (!rtas_log_buf) {
+		printk(KERN_ERR "rtasd: no memory\n");
+		return -ENOMEM;
+	}
+
+	start_event_scan();
+
+	return 0;
+}
+arch_initcall(rtas_event_scan_init);
+
+static int __init rtas_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!machine_is(pseries) && !machine_is(chrp))
+		return 0;
+
+	if (!rtas_log_buf)
+		return -ENODEV;
+
+	entry = proc_create("powerpc/rtas/error_log", 0400, NULL,
+			    &proc_rtas_log_operations);
+	if (!entry)
+		printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+	return 0;
+}
+__initcall(rtas_init);
+
+static int __init surveillance_setup(char *str)
+{
+	int i;
+
+	/* We only do surveillance on pseries */
+	if (!machine_is(pseries))
+		return 0;
+
+	if (get_option(&str,&i)) {
+		if (i >= 0 && i <= 255)
+			surveillance_timeout = i;
+	}
+
+	return 1;
+}
+__setup("surveillance=", surveillance_setup);
+
+static int __init rtasmsgs_setup(char *str)
+{
+	return (kstrtobool(str, &full_rtas_msgs) == 0);
+}
+__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
new file mode 100644
index 000000000..48985a1fd
--- /dev/null
+++ b/arch/powerpc/kernel/security.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Security related flags and so on.
+//
+// Copyright 2018, Michael Ellerman, IBM Corporation.
+
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/seq_buf.h>
+
+#include <asm/asm-prototypes.h>
+#include <asm/code-patching.h>
+#include <asm/debugfs.h>
+#include <asm/security_features.h>
+#include <asm/setup.h>
+
+
+unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
+
+enum count_cache_flush_type {
+	COUNT_CACHE_FLUSH_NONE	= 0x1,
+	COUNT_CACHE_FLUSH_SW	= 0x2,
+	COUNT_CACHE_FLUSH_HW	= 0x4,
+};
+static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE;
+static bool link_stack_flush_enabled;
+
+bool barrier_nospec_enabled;
+static bool no_nospec;
+static bool btb_flush_enabled;
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
+static bool no_spectrev2;
+#endif
+
+static void enable_barrier_nospec(bool enable)
+{
+	barrier_nospec_enabled = enable;
+	do_barrier_nospec_fixups(enable);
+}
+
+void setup_barrier_nospec(void)
+{
+	bool enable;
+
+	/*
+	 * It would make sense to check SEC_FTR_SPEC_BAR_ORI31 below as well.
+	 * But there's a good reason not to. The two flags we check below are
+	 * both are enabled by default in the kernel, so if the hcall is not
+	 * functional they will be enabled.
+	 * On a system where the host firmware has been updated (so the ori
+	 * functions as a barrier), but on which the hypervisor (KVM/Qemu) has
+	 * not been updated, we would like to enable the barrier. Dropping the
+	 * check for SEC_FTR_SPEC_BAR_ORI31 achieves that. The only downside is
+	 * we potentially enable the barrier on systems where the host firmware
+	 * is not updated, but that's harmless as it's a no-op.
+	 */
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		 security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);
+
+	if (!no_nospec && !cpu_mitigations_off())
+		enable_barrier_nospec(enable);
+}
+
+static int __init handle_nospectre_v1(char *p)
+{
+	no_nospec = true;
+
+	return 0;
+}
+early_param("nospectre_v1", handle_nospectre_v1);
+
+#ifdef CONFIG_DEBUG_FS
+static int barrier_nospec_set(void *data, u64 val)
+{
+	switch (val) {
+	case 0:
+	case 1:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!!val == !!barrier_nospec_enabled)
+		return 0;
+
+	enable_barrier_nospec(!!val);
+
+	return 0;
+}
+
+static int barrier_nospec_get(void *data, u64 *val)
+{
+	*val = barrier_nospec_enabled ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_barrier_nospec,
+			barrier_nospec_get, barrier_nospec_set, "%llu\n");
+
+static __init int barrier_nospec_debugfs_init(void)
+{
+	debugfs_create_file("barrier_nospec", 0600, powerpc_debugfs_root, NULL,
+			    &fops_barrier_nospec);
+	return 0;
+}
+device_initcall(barrier_nospec_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
+static int __init handle_nospectre_v2(char *p)
+{
+	no_spectrev2 = true;
+
+	return 0;
+}
+early_param("nospectre_v2", handle_nospectre_v2);
+#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+void setup_spectre_v2(void)
+{
+	if (no_spectrev2 || cpu_mitigations_off())
+		do_btb_flush_fixups();
+	else
+		btb_flush_enabled = true;
+}
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	bool thread_priv;
+
+	thread_priv = security_ftr_enabled(SEC_FTR_L1D_THREAD_PRIV);
+
+	if (rfi_flush) {
+		struct seq_buf s;
+		seq_buf_init(&s, buf, PAGE_SIZE - 1);
+
+		seq_buf_printf(&s, "Mitigation: RFI Flush");
+		if (thread_priv)
+			seq_buf_printf(&s, ", L1D private per thread");
+
+		seq_buf_printf(&s, "\n");
+
+		return s.len;
+	}
+
+	if (thread_priv)
+		return sprintf(buf, "Vulnerable: L1D private per thread\n");
+
+	if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) &&
+	    !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR))
+		return sprintf(buf, "Not affected\n");
+
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_meltdown(dev, attr, buf);
+}
+#endif
+
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct seq_buf s;
+
+	seq_buf_init(&s, buf, PAGE_SIZE - 1);
+
+	if (security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR)) {
+		if (barrier_nospec_enabled)
+			seq_buf_printf(&s, "Mitigation: __user pointer sanitization");
+		else
+			seq_buf_printf(&s, "Vulnerable");
+
+		if (security_ftr_enabled(SEC_FTR_SPEC_BAR_ORI31))
+			seq_buf_printf(&s, ", ori31 speculation barrier enabled");
+
+		seq_buf_printf(&s, "\n");
+	} else
+		seq_buf_printf(&s, "Not affected\n");
+
+	return s.len;
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct seq_buf s;
+	bool bcs, ccd;
+
+	seq_buf_init(&s, buf, PAGE_SIZE - 1);
+
+	bcs = security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED);
+	ccd = security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED);
+
+	if (bcs || ccd) {
+		seq_buf_printf(&s, "Mitigation: ");
+
+		if (bcs)
+			seq_buf_printf(&s, "Indirect branch serialisation (kernel only)");
+
+		if (bcs && ccd)
+			seq_buf_printf(&s, ", ");
+
+		if (ccd)
+			seq_buf_printf(&s, "Indirect branch cache disabled");
+
+		if (link_stack_flush_enabled)
+			seq_buf_printf(&s, ", Software link stack flush");
+
+	} else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) {
+		seq_buf_printf(&s, "Mitigation: Software count cache flush");
+
+		if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW)
+			seq_buf_printf(&s, " (hardware accelerated)");
+
+		if (link_stack_flush_enabled)
+			seq_buf_printf(&s, ", Software link stack flush");
+
+	} else if (btb_flush_enabled) {
+		seq_buf_printf(&s, "Mitigation: Branch predictor state flush");
+	} else {
+		seq_buf_printf(&s, "Vulnerable");
+	}
+
+	seq_buf_printf(&s, "\n");
+
+	return s.len;
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * Store-forwarding barrier support.
+ */
+
+static enum stf_barrier_type stf_enabled_flush_types;
+static bool no_stf_barrier;
+bool stf_barrier;
+
+static int __init handle_no_stf_barrier(char *p)
+{
+	pr_info("stf-barrier: disabled on command line.");
+	no_stf_barrier = true;
+	return 0;
+}
+
+early_param("no_stf_barrier", handle_no_stf_barrier);
+
+enum stf_barrier_type stf_barrier_type_get(void)
+{
+	return stf_enabled_flush_types;
+}
+
+/* This is the generic flag used by other architectures */
+static int __init handle_ssbd(char *p)
+{
+	if (!p || strncmp(p, "auto", 5) == 0 || strncmp(p, "on", 2) == 0 ) {
+		/* Until firmware tells us, we have the barrier with auto */
+		return 0;
+	} else if (strncmp(p, "off", 3) == 0) {
+		handle_no_stf_barrier(NULL);
+		return 0;
+	} else
+		return 1;
+
+	return 0;
+}
+early_param("spec_store_bypass_disable", handle_ssbd);
+
+/* This is the generic flag used by other architectures */
+static int __init handle_no_ssbd(char *p)
+{
+	handle_no_stf_barrier(NULL);
+	return 0;
+}
+early_param("nospec_store_bypass_disable", handle_no_ssbd);
+
+static void stf_barrier_enable(bool enable)
+{
+	if (enable)
+		do_stf_barrier_fixups(stf_enabled_flush_types);
+	else
+		do_stf_barrier_fixups(STF_BARRIER_NONE);
+
+	stf_barrier = enable;
+}
+
+void setup_stf_barrier(void)
+{
+	enum stf_barrier_type type;
+	bool enable, hv;
+
+	hv = cpu_has_feature(CPU_FTR_HVMODE);
+
+	/* Default to fallback in case fw-features are not available */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		type = STF_BARRIER_EIEIO;
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		type = STF_BARRIER_SYNC_ORI;
+	else if (cpu_has_feature(CPU_FTR_ARCH_206))
+		type = STF_BARRIER_FALLBACK;
+	else
+		type = STF_BARRIER_NONE;
+
+	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		(security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) ||
+		 (security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && hv));
+
+	if (type == STF_BARRIER_FALLBACK) {
+		pr_info("stf-barrier: fallback barrier available\n");
+	} else if (type == STF_BARRIER_SYNC_ORI) {
+		pr_info("stf-barrier: hwsync barrier available\n");
+	} else if (type == STF_BARRIER_EIEIO) {
+		pr_info("stf-barrier: eieio barrier available\n");
+	}
+
+	stf_enabled_flush_types = type;
+
+	if (!no_stf_barrier && !cpu_mitigations_off())
+		stf_barrier_enable(enable);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	if (stf_barrier && stf_enabled_flush_types != STF_BARRIER_NONE) {
+		const char *type;
+		switch (stf_enabled_flush_types) {
+		case STF_BARRIER_EIEIO:
+			type = "eieio";
+			break;
+		case STF_BARRIER_SYNC_ORI:
+			type = "hwsync";
+			break;
+		case STF_BARRIER_FALLBACK:
+			type = "fallback";
+			break;
+		default:
+			type = "unknown";
+		}
+		return sprintf(buf, "Mitigation: Kernel entry/exit barrier (%s)\n", type);
+	}
+
+	if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) &&
+	    !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR))
+		return sprintf(buf, "Not affected\n");
+
+	return sprintf(buf, "Vulnerable\n");
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int stf_barrier_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	/* Only do anything if we're changing state */
+	if (enable != stf_barrier)
+		stf_barrier_enable(enable);
+
+	return 0;
+}
+
+static int stf_barrier_get(void *data, u64 *val)
+{
+	*val = stf_barrier ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, "%llu\n");
+
+static __init int stf_barrier_debugfs_init(void)
+{
+	debugfs_create_file("stf_barrier", 0600, powerpc_debugfs_root, NULL, &fops_stf_barrier);
+	return 0;
+}
+device_initcall(stf_barrier_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
+static void no_count_cache_flush(void)
+{
+	count_cache_flush_type = COUNT_CACHE_FLUSH_NONE;
+	pr_info("count-cache-flush: software flush disabled.\n");
+}
+
+static void toggle_count_cache_flush(bool enable)
+{
+	if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE) &&
+	    !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK))
+		enable = false;
+
+	if (!enable) {
+		patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		patch_instruction_site(&patch__call_kvm_flush_link_stack, PPC_INST_NOP);
+#endif
+		pr_info("link-stack-flush: software flush disabled.\n");
+		link_stack_flush_enabled = false;
+		no_count_cache_flush();
+		return;
+	}
+
+	// This enables the branch from _switch to flush_count_cache
+	patch_branch_site(&patch__call_flush_count_cache,
+			  (u64)&flush_count_cache, BRANCH_SET_LINK);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	// This enables the branch from guest_exit_cont to kvm_flush_link_stack
+	patch_branch_site(&patch__call_kvm_flush_link_stack,
+			  (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
+#endif
+
+	pr_info("link-stack-flush: software flush enabled.\n");
+	link_stack_flush_enabled = true;
+
+	// If we just need to flush the link stack, patch an early return
+	if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) {
+		patch_instruction_site(&patch__flush_link_stack_return, PPC_INST_BLR);
+		no_count_cache_flush();
+		return;
+	}
+
+	if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) {
+		count_cache_flush_type = COUNT_CACHE_FLUSH_SW;
+		pr_info("count-cache-flush: full software flush sequence enabled.\n");
+		return;
+	}
+
+	patch_instruction_site(&patch__flush_count_cache_return, PPC_INST_BLR);
+	count_cache_flush_type = COUNT_CACHE_FLUSH_HW;
+	pr_info("count-cache-flush: hardware assisted flush sequence enabled\n");
+}
+
+void setup_count_cache_flush(void)
+{
+	bool enable = true;
+
+	if (no_spectrev2 || cpu_mitigations_off()) {
+		if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) ||
+		    security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED))
+			pr_warn("Spectre v2 mitigations not fully under software control, can't disable\n");
+
+		enable = false;
+	}
+
+	/*
+	 * There's no firmware feature flag/hypervisor bit to tell us we need to
+	 * flush the link stack on context switch. So we set it here if we see
+	 * either of the Spectre v2 mitigations that aim to protect userspace.
+	 */
+	if (security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED) ||
+	    security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE))
+		security_ftr_set(SEC_FTR_FLUSH_LINK_STACK);
+
+	toggle_count_cache_flush(enable);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int count_cache_flush_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	toggle_count_cache_flush(enable);
+
+	return 0;
+}
+
+static int count_cache_flush_get(void *data, u64 *val)
+{
+	if (count_cache_flush_type == COUNT_CACHE_FLUSH_NONE)
+		*val = 0;
+	else
+		*val = 1;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get,
+			count_cache_flush_set, "%llu\n");
+
+static __init int count_cache_flush_debugfs_init(void)
+{
+	debugfs_create_file("count_cache_flush", 0600, powerpc_debugfs_root,
+			    NULL, &fops_count_cache_flush);
+	return 0;
+}
+device_initcall(count_cache_flush_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
new file mode 100644
index 000000000..7787a26d4
--- /dev/null
+++ b/arch/powerpc/kernel/setup-common.c
@@ -0,0 +1,989 @@
+/*
+ * Common boot and setup code for both 32-bit and 64-bit.
+ * Extracted from arch/powerpc/kernel/setup_64.c.
+ *
+ * Copyright (C) 2001 PPC64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/platform_device.h>
+#include <linux/seq_file.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/root_dev.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/unistd.h>
+#include <linux/serial.h>
+#include <linux/serial_8250.h>
+#include <linux/percpu.h>
+#include <linux/memblock.h>
+#include <linux/of_platform.h>
+#include <linux/hugetlb.h>
+#include <asm/debugfs.h>
+#include <asm/io.h>
+#include <asm/paca.h>
+#include <asm/prom.h>
+#include <asm/processor.h>
+#include <asm/vdso_datapage.h>
+#include <asm/pgtable.h>
+#include <asm/smp.h>
+#include <asm/elf.h>
+#include <asm/machdep.h>
+#include <asm/time.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/firmware.h>
+#include <asm/btext.h>
+#include <asm/nvram.h>
+#include <asm/setup.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/serial.h>
+#include <asm/cache.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/xmon.h>
+#include <asm/cputhreads.h>
+#include <mm/mmu_decl.h>
+#include <asm/fadump.h>
+#include <asm/udbg.h>
+#include <asm/hugetlb.h>
+#include <asm/livepatch.h>
+#include <asm/mmu_context.h>
+#include <asm/cpu_has_feature.h>
+
+#include "setup.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/* The main machine-dep calls structure
+ */
+struct machdep_calls ppc_md;
+EXPORT_SYMBOL(ppc_md);
+struct machdep_calls *machine_id;
+EXPORT_SYMBOL(machine_id);
+
+int boot_cpuid = -1;
+EXPORT_SYMBOL_GPL(boot_cpuid);
+
+/*
+ * These are used in binfmt_elf.c to put aux entries on the stack
+ * for each elf executable being started.
+ */
+int dcache_bsize;
+int icache_bsize;
+int ucache_bsize;
+
+
+unsigned long klimit = (unsigned long) _end;
+
+/*
+ * This still seems to be needed... -- paulus
+ */ 
+struct screen_info screen_info = {
+	.orig_x = 0,
+	.orig_y = 25,
+	.orig_video_cols = 80,
+	.orig_video_lines = 25,
+	.orig_video_isVGA = 1,
+	.orig_video_points = 16
+};
+#if defined(CONFIG_FB_VGA16_MODULE)
+EXPORT_SYMBOL(screen_info);
+#endif
+
+/* Variables required to store legacy IO irq routing */
+int of_i8042_kbd_irq;
+EXPORT_SYMBOL_GPL(of_i8042_kbd_irq);
+int of_i8042_aux_irq;
+EXPORT_SYMBOL_GPL(of_i8042_aux_irq);
+
+#ifdef __DO_IRQ_CANON
+/* XXX should go elsewhere eventually */
+int ppc_do_canonicalize_irqs;
+EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
+#endif
+
+#ifdef CONFIG_CRASH_CORE
+/* This keeps a track of which one is the crashing cpu. */
+int crashing_cpu = -1;
+#endif
+
+/* also used by kexec */
+void machine_shutdown(void)
+{
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * if fadump is active, cleanup the fadump registration before we
+	 * shutdown.
+	 */
+	fadump_cleanup();
+#endif
+
+	if (ppc_md.machine_shutdown)
+		ppc_md.machine_shutdown();
+}
+
+static void machine_hang(void)
+{
+	pr_emerg("System Halted, OK to turn off power\n");
+	local_irq_disable();
+	while (1)
+		;
+}
+
+void machine_restart(char *cmd)
+{
+	machine_shutdown();
+	if (ppc_md.restart)
+		ppc_md.restart(cmd);
+
+	smp_send_stop();
+
+	do_kernel_restart(cmd);
+	mdelay(1000);
+
+	machine_hang();
+}
+
+void machine_power_off(void)
+{
+	machine_shutdown();
+	if (pm_power_off)
+		pm_power_off();
+
+	smp_send_stop();
+	machine_hang();
+}
+/* Used by the G5 thermal driver */
+EXPORT_SYMBOL_GPL(machine_power_off);
+
+void (*pm_power_off)(void);
+EXPORT_SYMBOL_GPL(pm_power_off);
+
+void machine_halt(void)
+{
+	machine_shutdown();
+	if (ppc_md.halt)
+		ppc_md.halt();
+
+	smp_send_stop();
+	machine_hang();
+}
+
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(unsigned int, cpu_pvr);
+#endif
+
+static void show_cpuinfo_summary(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = NULL;
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC32)
+	unsigned long bogosum = 0;
+	int i;
+	for_each_online_cpu(i)
+		bogosum += loops_per_jiffy;
+	seq_printf(m, "total bogomips\t: %lu.%02lu\n",
+		   bogosum/(500000/HZ), bogosum/(5000/HZ) % 100);
+#endif /* CONFIG_SMP && CONFIG_PPC32 */
+	seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq);
+	if (ppc_md.name)
+		seq_printf(m, "platform\t: %s\n", ppc_md.name);
+	root = of_find_node_by_path("/");
+	if (root)
+		model = of_get_property(root, "model", NULL);
+	if (model)
+		seq_printf(m, "model\t\t: %s\n", model);
+	of_node_put(root);
+
+	if (ppc_md.show_cpuinfo != NULL)
+		ppc_md.show_cpuinfo(m);
+
+#ifdef CONFIG_PPC32
+	/* Display the amount of memory */
+	seq_printf(m, "Memory\t\t: %d MB\n",
+		   (unsigned int)(total_memory / (1024 * 1024)));
+#endif
+}
+
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+	unsigned long cpu_id = (unsigned long)v - 1;
+	unsigned int pvr;
+	unsigned long proc_freq;
+	unsigned short maj;
+	unsigned short min;
+
+#ifdef CONFIG_SMP
+	pvr = per_cpu(cpu_pvr, cpu_id);
+#else
+	pvr = mfspr(SPRN_PVR);
+#endif
+	maj = (pvr >> 8) & 0xFF;
+	min = pvr & 0xFF;
+
+	seq_printf(m, "processor\t: %lu\n", cpu_id);
+	seq_printf(m, "cpu\t\t: ");
+
+	if (cur_cpu_spec->pvr_mask && cur_cpu_spec->cpu_name)
+		seq_printf(m, "%s", cur_cpu_spec->cpu_name);
+	else
+		seq_printf(m, "unknown (%08x)", pvr);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		seq_printf(m, ", altivec supported");
+#endif /* CONFIG_ALTIVEC */
+
+	seq_printf(m, "\n");
+
+#ifdef CONFIG_TAU
+	if (cur_cpu_spec->cpu_features & CPU_FTR_TAU) {
+#ifdef CONFIG_TAU_AVERAGE
+		/* more straightforward, but potentially misleading */
+		seq_printf(m,  "temperature \t: %u C (uncalibrated)\n",
+			   cpu_temp(cpu_id));
+#else
+		/* show the actual temp sensor range */
+		u32 temp;
+		temp = cpu_temp_both(cpu_id);
+		seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n",
+			   temp & 0xff, temp >> 16);
+#endif
+	}
+#endif /* CONFIG_TAU */
+
+	/*
+	 * Platforms that have variable clock rates, should implement
+	 * the method ppc_md.get_proc_freq() that reports the clock
+	 * rate of a given cpu. The rest can use ppc_proc_freq to
+	 * report the clock rate that is same across all cpus.
+	 */
+	if (ppc_md.get_proc_freq)
+		proc_freq = ppc_md.get_proc_freq(cpu_id);
+	else
+		proc_freq = ppc_proc_freq;
+
+	if (proc_freq)
+		seq_printf(m, "clock\t\t: %lu.%06luMHz\n",
+			   proc_freq / 1000000, proc_freq % 1000000);
+
+	if (ppc_md.show_percpuinfo != NULL)
+		ppc_md.show_percpuinfo(m, cpu_id);
+
+	/* If we are a Freescale core do a simple check so
+	 * we dont have to keep adding cases in the future */
+	if (PVR_VER(pvr) & 0x8000) {
+		switch (PVR_VER(pvr)) {
+		case 0x8000:	/* 7441/7450/7451, Voyager */
+		case 0x8001:	/* 7445/7455, Apollo 6 */
+		case 0x8002:	/* 7447/7457, Apollo 7 */
+		case 0x8003:	/* 7447A, Apollo 7 PM */
+		case 0x8004:	/* 7448, Apollo 8 */
+		case 0x800c:	/* 7410, Nitro */
+			maj = ((pvr >> 8) & 0xF);
+			min = PVR_MIN(pvr);
+			break;
+		default:	/* e500/book-e */
+			maj = PVR_MAJ(pvr);
+			min = PVR_MIN(pvr);
+			break;
+		}
+	} else {
+		switch (PVR_VER(pvr)) {
+			case 0x0020:	/* 403 family */
+				maj = PVR_MAJ(pvr) + 1;
+				min = PVR_MIN(pvr);
+				break;
+			case 0x1008:	/* 740P/750P ?? */
+				maj = ((pvr >> 8) & 0xFF) - 1;
+				min = pvr & 0xFF;
+				break;
+			case 0x004e: /* POWER9 bits 12-15 give chip type */
+				maj = (pvr >> 8) & 0x0F;
+				min = pvr & 0xFF;
+				break;
+			default:
+				maj = (pvr >> 8) & 0xFF;
+				min = pvr & 0xFF;
+				break;
+		}
+	}
+
+	seq_printf(m, "revision\t: %hd.%hd (pvr %04x %04x)\n",
+		   maj, min, PVR_VER(pvr), PVR_REV(pvr));
+
+#ifdef CONFIG_PPC32
+	seq_printf(m, "bogomips\t: %lu.%02lu\n",
+		   loops_per_jiffy / (500000/HZ),
+		   (loops_per_jiffy / (5000/HZ)) % 100);
+#endif
+	seq_printf(m, "\n");
+
+	/* If this is the last cpu, print the summary */
+	if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids)
+		show_cpuinfo_summary(m);
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos == 0)	/* just in case, cpu 0 is not the first */
+		*pos = cpumask_first(cpu_online_mask);
+	else
+		*pos = cpumask_next(*pos - 1, cpu_online_mask);
+	if ((*pos) < nr_cpu_ids)
+		return (void *)(unsigned long)(*pos + 1);
+	return NULL;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return c_start(m, pos);
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+const struct seq_operations cpuinfo_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= show_cpuinfo,
+};
+
+void __init check_for_initrd(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	DBG(" -> check_for_initrd()  initrd_start=0x%lx  initrd_end=0x%lx\n",
+	    initrd_start, initrd_end);
+
+	/* If we were passed an initrd, set the ROOT_DEV properly if the values
+	 * look sensible. If not, clear initrd reference.
+	 */
+	if (is_kernel_addr(initrd_start) && is_kernel_addr(initrd_end) &&
+	    initrd_end > initrd_start)
+		ROOT_DEV = Root_RAM0;
+	else
+		initrd_start = initrd_end = 0;
+
+	if (initrd_start)
+		pr_info("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end);
+
+	DBG(" <- check_for_initrd()\n");
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+#ifdef CONFIG_SMP
+
+int threads_per_core, threads_per_subcore, threads_shift;
+cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_per_subcore);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
+
+static void __init cpu_init_thread_core_maps(int tpc)
+{
+	int i;
+
+	threads_per_core = tpc;
+	threads_per_subcore = tpc;
+	cpumask_clear(&threads_core_mask);
+
+	/* This implementation only supports power of 2 number of threads
+	 * for simplicity and performance
+	 */
+	threads_shift = ilog2(tpc);
+	BUG_ON(tpc != (1 << threads_shift));
+
+	for (i = 0; i < tpc; i++)
+		cpumask_set_cpu(i, &threads_core_mask);
+
+	printk(KERN_INFO "CPU maps initialized for %d thread%s per core\n",
+	       tpc, tpc > 1 ? "s" : "");
+	printk(KERN_DEBUG " (thread shift is %d)\n", threads_shift);
+}
+
+
+u32 *cpu_to_phys_id = NULL;
+
+/**
+ * setup_cpu_maps - initialize the following cpu maps:
+ *                  cpu_possible_mask
+ *                  cpu_present_mask
+ *
+ * Having the possible map set up early allows us to restrict allocations
+ * of things like irqstacks to nr_cpu_ids rather than NR_CPUS.
+ *
+ * We do not initialize the online map here; cpus set their own bits in
+ * cpu_online_mask as they come up.
+ *
+ * This function is valid only for Open Firmware systems.  finish_device_tree
+ * must be called before using this.
+ *
+ * While we're here, we may as well set the "physical" cpu ids in the paca.
+ *
+ * NOTE: This must match the parsing done in early_init_dt_scan_cpus.
+ */
+void __init smp_setup_cpu_maps(void)
+{
+	struct device_node *dn;
+	int cpu = 0;
+	int nthreads = 1;
+
+	DBG("smp_setup_cpu_maps()\n");
+
+	cpu_to_phys_id = __va(memblock_alloc(nr_cpu_ids * sizeof(u32),
+							__alignof__(u32)));
+	memset(cpu_to_phys_id, 0, nr_cpu_ids * sizeof(u32));
+
+	for_each_node_by_type(dn, "cpu") {
+		const __be32 *intserv;
+		__be32 cpu_be;
+		int j, len;
+
+		DBG("  * %pOF...\n", dn);
+
+		intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s",
+				&len);
+		if (intserv) {
+			DBG("    ibm,ppc-interrupt-server#s -> %d threads\n",
+			    nthreads);
+		} else {
+			DBG("    no ibm,ppc-interrupt-server#s -> 1 thread\n");
+			intserv = of_get_property(dn, "reg", &len);
+			if (!intserv) {
+				cpu_be = cpu_to_be32(cpu);
+				/* XXX: what is this? uninitialized?? */
+				intserv = &cpu_be;	/* assume logical == phys */
+				len = 4;
+			}
+		}
+
+		nthreads = len / sizeof(int);
+
+		for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) {
+			bool avail;
+
+			DBG("    thread %d -> cpu %d (hard id %d)\n",
+			    j, cpu, be32_to_cpu(intserv[j]));
+
+			avail = of_device_is_available(dn);
+			if (!avail)
+				avail = !of_property_match_string(dn,
+						"enable-method", "spin-table");
+
+			set_cpu_present(cpu, avail);
+			set_cpu_possible(cpu, true);
+			cpu_to_phys_id[cpu] = be32_to_cpu(intserv[j]);
+			cpu++;
+		}
+
+		if (cpu >= nr_cpu_ids) {
+			of_node_put(dn);
+			break;
+		}
+	}
+
+	/* If no SMT supported, nthreads is forced to 1 */
+	if (!cpu_has_feature(CPU_FTR_SMT)) {
+		DBG("  SMT disabled ! nthreads forced to 1\n");
+		nthreads = 1;
+	}
+
+#ifdef CONFIG_PPC64
+	/*
+	 * On pSeries LPAR, we need to know how many cpus
+	 * could possibly be added to this partition.
+	 */
+	if (firmware_has_feature(FW_FEATURE_LPAR) &&
+	    (dn = of_find_node_by_path("/rtas"))) {
+		int num_addr_cell, num_size_cell, maxcpus;
+		const __be32 *ireg;
+
+		num_addr_cell = of_n_addr_cells(dn);
+		num_size_cell = of_n_size_cells(dn);
+
+		ireg = of_get_property(dn, "ibm,lrdr-capacity", NULL);
+
+		if (!ireg)
+			goto out;
+
+		maxcpus = be32_to_cpup(ireg + num_addr_cell + num_size_cell);
+
+		/* Double maxcpus for processors which have SMT capability */
+		if (cpu_has_feature(CPU_FTR_SMT))
+			maxcpus *= nthreads;
+
+		if (maxcpus > nr_cpu_ids) {
+			printk(KERN_WARNING
+			       "Partition configured for %d cpus, "
+			       "operating system maximum is %u.\n",
+			       maxcpus, nr_cpu_ids);
+			maxcpus = nr_cpu_ids;
+		} else
+			printk(KERN_INFO "Partition configured for %d cpus.\n",
+			       maxcpus);
+
+		for (cpu = 0; cpu < maxcpus; cpu++)
+			set_cpu_possible(cpu, true);
+	out:
+		of_node_put(dn);
+	}
+	vdso_data->processorCount = num_present_cpus();
+#endif /* CONFIG_PPC64 */
+
+        /* Initialize CPU <=> thread mapping/
+	 *
+	 * WARNING: We assume that the number of threads is the same for
+	 * every CPU in the system. If that is not the case, then some code
+	 * here will have to be reworked
+	 */
+	cpu_init_thread_core_maps(nthreads);
+
+	/* Now that possible cpus are set, set nr_cpu_ids for later use */
+	setup_nr_cpu_ids();
+
+	free_unused_pacas();
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PCSPKR_PLATFORM
+static __init int add_pcspkr(void)
+{
+	struct device_node *np;
+	struct platform_device *pd;
+	int ret;
+
+	np = of_find_compatible_node(NULL, NULL, "pnpPNP,100");
+	of_node_put(np);
+	if (!np)
+		return -ENODEV;
+
+	pd = platform_device_alloc("pcspkr", -1);
+	if (!pd)
+		return -ENOMEM;
+
+	ret = platform_device_add(pd);
+	if (ret)
+		platform_device_put(pd);
+
+	return ret;
+}
+device_initcall(add_pcspkr);
+#endif	/* CONFIG_PCSPKR_PLATFORM */
+
+void probe_machine(void)
+{
+	extern struct machdep_calls __machine_desc_start;
+	extern struct machdep_calls __machine_desc_end;
+	unsigned int i;
+
+	/*
+	 * Iterate all ppc_md structures until we find the proper
+	 * one for the current machine type
+	 */
+	DBG("Probing machine type ...\n");
+
+	/*
+	 * Check ppc_md is empty, if not we have a bug, ie, we setup an
+	 * entry before probe_machine() which will be overwritten
+	 */
+	for (i = 0; i < (sizeof(ppc_md) / sizeof(void *)); i++) {
+		if (((void **)&ppc_md)[i]) {
+			printk(KERN_ERR "Entry %d in ppc_md non empty before"
+			       " machine probe !\n", i);
+		}
+	}
+
+	for (machine_id = &__machine_desc_start;
+	     machine_id < &__machine_desc_end;
+	     machine_id++) {
+		DBG("  %s ...", machine_id->name);
+		memcpy(&ppc_md, machine_id, sizeof(struct machdep_calls));
+		if (ppc_md.probe()) {
+			DBG(" match !\n");
+			break;
+		}
+		DBG("\n");
+	}
+	/* What can we do if we didn't find ? */
+	if (machine_id >= &__machine_desc_end) {
+		DBG("No suitable machine found !\n");
+		for (;;);
+	}
+
+	printk(KERN_INFO "Using %s machine description\n", ppc_md.name);
+}
+
+/* Match a class of boards, not a specific device configuration. */
+int check_legacy_ioport(unsigned long base_port)
+{
+	struct device_node *parent, *np = NULL;
+	int ret = -ENODEV;
+
+	switch(base_port) {
+	case I8042_DATA_REG:
+		if (!(np = of_find_compatible_node(NULL, NULL, "pnpPNP,303")))
+			np = of_find_compatible_node(NULL, NULL, "pnpPNP,f03");
+		if (np) {
+			parent = of_get_parent(np);
+
+			of_i8042_kbd_irq = irq_of_parse_and_map(parent, 0);
+			if (!of_i8042_kbd_irq)
+				of_i8042_kbd_irq = 1;
+
+			of_i8042_aux_irq = irq_of_parse_and_map(parent, 1);
+			if (!of_i8042_aux_irq)
+				of_i8042_aux_irq = 12;
+
+			of_node_put(np);
+			np = parent;
+			break;
+		}
+		np = of_find_node_by_type(NULL, "8042");
+		/* Pegasos has no device_type on its 8042 node, look for the
+		 * name instead */
+		if (!np)
+			np = of_find_node_by_name(NULL, "8042");
+		if (np) {
+			of_i8042_kbd_irq = 1;
+			of_i8042_aux_irq = 12;
+		}
+		break;
+	case FDC_BASE: /* FDC1 */
+		np = of_find_node_by_type(NULL, "fdc");
+		break;
+	default:
+		/* ipmi is supposed to fail here */
+		break;
+	}
+	if (!np)
+		return ret;
+	parent = of_get_parent(np);
+	if (parent) {
+		if (strcmp(parent->type, "isa") == 0)
+			ret = 0;
+		of_node_put(parent);
+	}
+	of_node_put(np);
+	return ret;
+}
+EXPORT_SYMBOL(check_legacy_ioport);
+
+static int ppc_panic_event(struct notifier_block *this,
+                             unsigned long event, void *ptr)
+{
+	/*
+	 * panic does a local_irq_disable, but we really
+	 * want interrupts to be hard disabled.
+	 */
+	hard_irq_disable();
+
+	/*
+	 * If firmware-assisted dump has been registered then trigger
+	 * firmware-assisted dump and let firmware handle everything else.
+	 */
+	crash_fadump(NULL, ptr);
+	if (ppc_md.panic)
+		ppc_md.panic(ptr);  /* May not return */
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ppc_panic_block = {
+	.notifier_call = ppc_panic_event,
+	.priority = INT_MIN /* may not return; must be done last */
+};
+
+void __init setup_panic(void)
+{
+	/* PPC64 always does a hard irq disable in its panic handler */
+	if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic)
+		return;
+	atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block);
+}
+
+#ifdef CONFIG_CHECK_CACHE_COHERENCY
+/*
+ * For platforms that have configurable cache-coherency.  This function
+ * checks that the cache coherency setting of the kernel matches the setting
+ * left by the firmware, as indicated in the device tree.  Since a mismatch
+ * will eventually result in DMA failures, we print * and error and call
+ * BUG() in that case.
+ */
+
+#ifdef CONFIG_NOT_COHERENT_CACHE
+#define KERNEL_COHERENCY	0
+#else
+#define KERNEL_COHERENCY	1
+#endif
+
+static int __init check_cache_coherency(void)
+{
+	struct device_node *np;
+	const void *prop;
+	int devtree_coherency;
+
+	np = of_find_node_by_path("/");
+	prop = of_get_property(np, "coherency-off", NULL);
+	of_node_put(np);
+
+	devtree_coherency = prop ? 0 : 1;
+
+	if (devtree_coherency != KERNEL_COHERENCY) {
+		printk(KERN_ERR
+			"kernel coherency:%s != device tree_coherency:%s\n",
+			KERNEL_COHERENCY ? "on" : "off",
+			devtree_coherency ? "on" : "off");
+		BUG();
+	}
+
+	return 0;
+}
+
+late_initcall(check_cache_coherency);
+#endif /* CONFIG_CHECK_CACHE_COHERENCY */
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *powerpc_debugfs_root;
+EXPORT_SYMBOL(powerpc_debugfs_root);
+
+static int powerpc_debugfs_init(void)
+{
+	powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL);
+
+	return powerpc_debugfs_root == NULL;
+}
+arch_initcall(powerpc_debugfs_init);
+#endif
+
+void ppc_printk_progress(char *s, unsigned short hex)
+{
+	pr_info("%s\n", s);
+}
+
+void arch_setup_pdev_archdata(struct platform_device *pdev)
+{
+	pdev->archdata.dma_mask = DMA_BIT_MASK(32);
+	pdev->dev.dma_mask = &pdev->archdata.dma_mask;
+ 	set_dma_ops(&pdev->dev, &dma_nommu_ops);
+}
+
+static __init void print_system_info(void)
+{
+	pr_info("-----------------------------------------------------\n");
+#ifdef CONFIG_PPC_BOOK3S_64
+	pr_info("ppc64_pft_size    = 0x%llx\n", ppc64_pft_size);
+#endif
+#ifdef CONFIG_PPC_STD_MMU_32
+	pr_info("Hash_size         = 0x%lx\n", Hash_size);
+#endif
+	pr_info("phys_mem_size     = 0x%llx\n",
+		(unsigned long long)memblock_phys_mem_size());
+
+	pr_info("dcache_bsize      = 0x%x\n", dcache_bsize);
+	pr_info("icache_bsize      = 0x%x\n", icache_bsize);
+	if (ucache_bsize != 0)
+		pr_info("ucache_bsize      = 0x%x\n", ucache_bsize);
+
+	pr_info("cpu_features      = 0x%016lx\n", cur_cpu_spec->cpu_features);
+	pr_info("  possible        = 0x%016lx\n",
+		(unsigned long)CPU_FTRS_POSSIBLE);
+	pr_info("  always          = 0x%016lx\n",
+		(unsigned long)CPU_FTRS_ALWAYS);
+	pr_info("cpu_user_features = 0x%08x 0x%08x\n",
+		cur_cpu_spec->cpu_user_features,
+		cur_cpu_spec->cpu_user_features2);
+	pr_info("mmu_features      = 0x%08x\n", cur_cpu_spec->mmu_features);
+#ifdef CONFIG_PPC64
+	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (htab_address)
+		pr_info("htab_address      = 0x%p\n", htab_address);
+	if (htab_hash_mask)
+		pr_info("htab_hash_mask    = 0x%lx\n", htab_hash_mask);
+#endif
+#ifdef CONFIG_PPC_STD_MMU_32
+	if (Hash)
+		pr_info("Hash              = 0x%p\n", Hash);
+	if (Hash_mask)
+		pr_info("Hash_mask         = 0x%lx\n", Hash_mask);
+#endif
+
+	if (PHYSICAL_START > 0)
+		pr_info("physical_start    = 0x%llx\n",
+		       (unsigned long long)PHYSICAL_START);
+	pr_info("-----------------------------------------------------\n");
+}
+
+#ifdef CONFIG_SMP
+static void smp_setup_pacas(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == smp_processor_id())
+			continue;
+		allocate_paca(cpu);
+		set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]);
+	}
+
+	memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32));
+	cpu_to_phys_id = NULL;
+}
+#endif
+
+/*
+ * Called into from start_kernel this initializes memblock, which is used
+ * to manage page allocation until mem_init is called.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+	*cmdline_p = boot_command_line;
+
+	/* Set a half-reasonable default so udelay does something sensible */
+	loops_per_jiffy = 500000000 / HZ;
+
+	/* Unflatten the device-tree passed by prom_init or kexec */
+	unflatten_device_tree();
+
+	/*
+	 * Initialize cache line/block info from device-tree (on ppc64) or
+	 * just cputable (on ppc32).
+	 */
+	initialize_cache_info();
+
+	/* Initialize RTAS if available. */
+	rtas_initialize();
+
+	/* Check if we have an initrd provided via the device-tree. */
+	check_for_initrd();
+
+	/* Probe the machine type, establish ppc_md. */
+	probe_machine();
+
+	/* Setup panic notifier if requested by the platform. */
+	setup_panic();
+
+	/*
+	 * Configure ppc_md.power_save (ppc32 only, 64-bit machines do
+	 * it from their respective probe() function.
+	 */
+	setup_power_save();
+
+	/* Discover standard serial ports. */
+	find_legacy_serial_ports();
+
+	/* Register early console with the printk subsystem. */
+	register_early_udbg_console();
+
+	/* Setup the various CPU maps based on the device-tree. */
+	smp_setup_cpu_maps();
+
+	/* Initialize xmon. */
+	xmon_setup();
+
+	/* Check the SMT related command line arguments (ppc64). */
+	check_smt_enabled();
+
+	/* Parse memory topology */
+	mem_topology_setup();
+
+	/*
+	 * Release secondary cpus out of their spinloops at 0x60 now that
+	 * we can map physical -> logical CPU ids.
+	 *
+	 * Freescale Book3e parts spin in a loop provided by firmware,
+	 * so smp_release_cpus() does nothing for them.
+	 */
+#ifdef CONFIG_SMP
+	smp_setup_pacas();
+
+	/* On BookE, setup per-core TLB data structures. */
+	setup_tlb_core_data();
+#endif
+
+	/* Print various info about the machine that has been gathered so far. */
+	print_system_info();
+
+	/* Reserve large chunks of memory for use by CMA for KVM. */
+	kvm_cma_reserve();
+
+	klp_init_thread_info(&init_thread_info);
+
+	init_mm.start_code = (unsigned long)_stext;
+	init_mm.end_code = (unsigned long) _etext;
+	init_mm.end_data = (unsigned long) _edata;
+	init_mm.brk = klimit;
+
+#ifdef CONFIG_PPC_MM_SLICES
+#ifdef CONFIG_PPC64
+	if (!radix_enabled())
+		init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
+#elif defined(CONFIG_PPC_8xx)
+	init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW;
+#else
+#error	"context.addr_limit not initialized."
+#endif
+#endif
+
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&init_mm);
+#endif
+	irqstack_early_init();
+	exc_lvl_early_init();
+	emergency_stack_init();
+
+	smp_release_cpus();
+
+	initmem_init();
+
+#ifdef CONFIG_DUMMY_CONSOLE
+	conswitchp = &dummy_con;
+#endif
+	if (ppc_md.setup_arch)
+		ppc_md.setup_arch();
+
+	setup_barrier_nospec();
+	setup_spectre_v2();
+
+	paging_init();
+
+	/* Initialize the MMU context management stuff. */
+	mmu_context_init();
+
+#ifdef CONFIG_PPC64
+	/* Interrupt code needs to be 64K-aligned. */
+	if ((unsigned long)_stext & 0xffff)
+		panic("Kernelbase not 64K-aligned (0x%lx)!\n",
+		      (unsigned long)_stext);
+#endif
+}
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
new file mode 100644
index 000000000..c6a592b67
--- /dev/null
+++ b/arch/powerpc/kernel/setup.h
@@ -0,0 +1,71 @@
+/*
+ * Prototypes for functions that are shared between setup_(32|64|common).c
+ *
+ * Copyright 2016 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef __ARCH_POWERPC_KERNEL_SETUP_H
+#define __ARCH_POWERPC_KERNEL_SETUP_H
+
+void initialize_cache_info(void);
+void irqstack_early_init(void);
+
+#ifdef CONFIG_PPC32
+void setup_power_save(void);
+#else
+static inline void setup_power_save(void) { };
+#endif
+
+#if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
+void check_smt_enabled(void);
+#else
+static inline void check_smt_enabled(void) { };
+#endif
+
+#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
+void setup_tlb_core_data(void);
+#else
+static inline void setup_tlb_core_data(void) { };
+#endif
+
+#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+void exc_lvl_early_init(void);
+#else
+static inline void exc_lvl_early_init(void) { };
+#endif
+
+#ifdef CONFIG_PPC64
+void emergency_stack_init(void);
+#else
+static inline void emergency_stack_init(void) { };
+#endif
+
+#ifdef CONFIG_PPC64
+u64 ppc64_bolted_size(void);
+
+/* Default SPR values from firmware/kexec */
+extern unsigned long spr_default_dscr;
+#endif
+
+/*
+ * Having this in kvm_ppc.h makes include dependencies too
+ * tricky to solve for setup-common.c so have it here.
+ */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void kvm_cma_reserve(void);
+#else
+static inline void kvm_cma_reserve(void) { };
+#endif
+
+#ifdef CONFIG_TAU
+u32 cpu_temp(unsigned long cpu);
+u32 cpu_temp_both(unsigned long cpu);
+u32 tau_interrupts(unsigned long cpu);
+#endif /* CONFIG_TAU */
+
+#endif /* __ARCH_POWERPC_KERNEL_SETUP_H */
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
new file mode 100644
index 000000000..8c507be12
--- /dev/null
+++ b/arch/powerpc/kernel/setup_32.c
@@ -0,0 +1,268 @@
+/*
+ * Common prep/pmac/chrp boot and setup code.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/tty.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/memblock.h>
+#include <linux/export.h>
+
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+#include <asm/elf.h>
+#include <asm/cputable.h>
+#include <asm/bootx.h>
+#include <asm/btext.h>
+#include <asm/machdep.h>
+#include <linux/uaccess.h>
+#include <asm/pmac_feature.h>
+#include <asm/sections.h>
+#include <asm/nvram.h>
+#include <asm/xmon.h>
+#include <asm/time.h>
+#include <asm/serial.h>
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/asm-prototypes.h>
+#include <asm/kdump.h>
+#include <asm/feature-fixups.h>
+
+#include "setup.h"
+
+#define DBG(fmt...)
+
+extern void bootx_init(unsigned long r4, unsigned long phys);
+
+int boot_cpuid_phys;
+EXPORT_SYMBOL_GPL(boot_cpuid_phys);
+
+int smp_hw_index[NR_CPUS];
+EXPORT_SYMBOL(smp_hw_index);
+
+unsigned long ISA_DMA_THRESHOLD;
+unsigned int DMA_MODE_READ;
+unsigned int DMA_MODE_WRITE;
+
+EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
+EXPORT_SYMBOL(DMA_MODE_READ);
+EXPORT_SYMBOL(DMA_MODE_WRITE);
+
+/*
+ * We're called here very early in the boot.
+ *
+ * Note that the kernel may be running at an address which is different
+ * from the address that it was linked at, so we must use RELOC/PTRRELOC
+ * to access static data (including strings).  -- paulus
+ */
+notrace unsigned long __init early_init(unsigned long dt_ptr)
+{
+	unsigned long offset = reloc_offset();
+
+	/* First zero the BSS -- use memset_io, some platforms don't have
+	 * caches on yet */
+	memset_io((void __iomem *)PTRRELOC(&__bss_start), 0,
+			__bss_stop - __bss_start);
+
+	/*
+	 * Identify the CPU type and fix up code sections
+	 * that depend on which cpu we have.
+	 */
+	identify_cpu(offset, mfspr(SPRN_PVR));
+
+	apply_feature_fixups();
+
+	return KERNELBASE + offset;
+}
+
+
+/*
+ * This is run before start_kernel(), the kernel has been relocated
+ * and we are running with enough of the MMU enabled to have our
+ * proper kernel virtual addresses
+ *
+ * We do the initial parsing of the flat device-tree and prepares
+ * for the MMU to be fully initialized.
+ */
+notrace void __init machine_init(u64 dt_ptr)
+{
+	unsigned int *addr = (unsigned int *)((unsigned long)&patch__memset_nocache +
+					       patch__memset_nocache);
+	unsigned long insn;
+
+	/* Configure static keys first, now that we're relocated. */
+	setup_feature_keys();
+
+	/* Enable early debugging if any specified (see udbg.h) */
+	udbg_early_init();
+
+	patch_instruction_site(&patch__memcpy_nocache, PPC_INST_NOP);
+
+	insn = create_cond_branch(addr, branch_target(addr), 0x820000);
+	patch_instruction(addr, insn);	/* replace b by bne cr0 */
+
+	/* Do some early initialization based on the flat device tree */
+	early_init_devtree(__va(dt_ptr));
+
+	early_init_mmu();
+
+	setup_kdump_trampoline();
+}
+
+/* Checks "l2cr=xxxx" command-line option */
+static int __init ppc_setup_l2cr(char *str)
+{
+	if (cpu_has_feature(CPU_FTR_L2CR)) {
+		unsigned long val = simple_strtoul(str, NULL, 0);
+		printk(KERN_INFO "l2cr set to %lx\n", val);
+		_set_L2CR(0);		/* force invalidate by disable cache */
+		_set_L2CR(val);		/* and enable it */
+	}
+	return 1;
+}
+__setup("l2cr=", ppc_setup_l2cr);
+
+/* Checks "l3cr=xxxx" command-line option */
+static int __init ppc_setup_l3cr(char *str)
+{
+	if (cpu_has_feature(CPU_FTR_L3CR)) {
+		unsigned long val = simple_strtoul(str, NULL, 0);
+		printk(KERN_INFO "l3cr set to %lx\n", val);
+		_set_L3CR(val);		/* and enable it */
+	}
+	return 1;
+}
+__setup("l3cr=", ppc_setup_l3cr);
+
+#ifdef CONFIG_GENERIC_NVRAM
+
+/* Generic nvram hooks used by drivers/char/gen_nvram.c */
+unsigned char nvram_read_byte(int addr)
+{
+	if (ppc_md.nvram_read_val)
+		return ppc_md.nvram_read_val(addr);
+	return 0xff;
+}
+EXPORT_SYMBOL(nvram_read_byte);
+
+void nvram_write_byte(unsigned char val, int addr)
+{
+	if (ppc_md.nvram_write_val)
+		ppc_md.nvram_write_val(addr, val);
+}
+EXPORT_SYMBOL(nvram_write_byte);
+
+ssize_t nvram_get_size(void)
+{
+	if (ppc_md.nvram_size)
+		return ppc_md.nvram_size();
+	return -1;
+}
+EXPORT_SYMBOL(nvram_get_size);
+
+void nvram_sync(void)
+{
+	if (ppc_md.nvram_sync)
+		ppc_md.nvram_sync();
+}
+EXPORT_SYMBOL(nvram_sync);
+
+#endif /* CONFIG_NVRAM */
+
+static int __init ppc_init(void)
+{
+	/* clear the progress line */
+	if (ppc_md.progress)
+		ppc_md.progress("             ", 0xffff);
+
+	/* call platform init */
+	if (ppc_md.init != NULL) {
+		ppc_md.init();
+	}
+	return 0;
+}
+arch_initcall(ppc_init);
+
+void __init irqstack_early_init(void)
+{
+	unsigned int i;
+
+	/* interrupt stacks must be in lowmem, we get that for free on ppc32
+	 * as the memblock is limited to lowmem by default */
+	for_each_possible_cpu(i) {
+		softirq_ctx[i] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+		hardirq_ctx[i] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+	}
+}
+
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+void __init exc_lvl_early_init(void)
+{
+	unsigned int i, hw_cpu;
+
+	/* interrupt stacks must be in lowmem, we get that for free on ppc32
+	 * as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
+		hw_cpu = get_hard_smp_processor_id(i);
+#else
+		hw_cpu = 0;
+#endif
+
+		critirq_ctx[hw_cpu] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+#ifdef CONFIG_BOOKE
+		dbgirq_ctx[hw_cpu] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+		mcheckirq_ctx[hw_cpu] = (struct thread_info *)
+			__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+#endif
+	}
+}
+#endif
+
+void __init setup_power_save(void)
+{
+#ifdef CONFIG_6xx
+	if (cpu_has_feature(CPU_FTR_CAN_DOZE) ||
+	    cpu_has_feature(CPU_FTR_CAN_NAP))
+		ppc_md.power_save = ppc6xx_idle;
+#endif
+
+#ifdef CONFIG_E500
+	if (cpu_has_feature(CPU_FTR_CAN_DOZE) ||
+	    cpu_has_feature(CPU_FTR_CAN_NAP))
+		ppc_md.power_save = e500_idle;
+#endif
+}
+
+__init void initialize_cache_info(void)
+{
+	/*
+	 * Set cache line size based on type of cpu as a default.
+	 * Systems with OF can look in the properties on the cpu node(s)
+	 * for a possibly more accurate value.
+	 */
+	dcache_bsize = cur_cpu_spec->dcache_bsize;
+	icache_bsize = cur_cpu_spec->icache_bsize;
+	ucache_bsize = 0;
+	if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE))
+		ucache_bsize = icache_bsize = dcache_bsize;
+}
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
new file mode 100644
index 000000000..122365624
--- /dev/null
+++ b/arch/powerpc/kernel/setup_64.c
@@ -0,0 +1,1118 @@
+/*
+ * 
+ * Common boot and setup code.
+ *
+ * Copyright (C) 2001 PPC64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/initrd.h>
+#include <linux/seq_file.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/utsname.h>
+#include <linux/tty.h>
+#include <linux/root_dev.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/unistd.h>
+#include <linux/serial.h>
+#include <linux/serial_8250.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/nmi.h>
+
+#include <asm/debugfs.h>
+#include <asm/io.h>
+#include <asm/kdump.h>
+#include <asm/prom.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/smp.h>
+#include <asm/elf.h>
+#include <asm/machdep.h>
+#include <asm/paca.h>
+#include <asm/time.h>
+#include <asm/cputable.h>
+#include <asm/dt_cpu_ftrs.h>
+#include <asm/sections.h>
+#include <asm/btext.h>
+#include <asm/nvram.h>
+#include <asm/setup.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/serial.h>
+#include <asm/cache.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+#include <asm/xmon.h>
+#include <asm/udbg.h>
+#include <asm/kexec.h>
+#include <asm/code-patching.h>
+#include <asm/livepatch.h>
+#include <asm/opal.h>
+#include <asm/cputhreads.h>
+#include <asm/hw_irq.h>
+#include <asm/feature-fixups.h>
+
+#include "setup.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+int spinning_secondaries;
+u64 ppc64_pft_size;
+
+struct ppc64_caches ppc64_caches = {
+	.l1d = {
+		.block_size = 0x40,
+		.log_block_size = 6,
+	},
+	.l1i = {
+		.block_size = 0x40,
+		.log_block_size = 6
+	},
+};
+EXPORT_SYMBOL_GPL(ppc64_caches);
+
+#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
+void __init setup_tlb_core_data(void)
+{
+	int cpu;
+
+	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);
+
+	for_each_possible_cpu(cpu) {
+		int first = cpu_first_thread_sibling(cpu);
+
+		/*
+		 * If we boot via kdump on a non-primary thread,
+		 * make sure we point at the thread that actually
+		 * set up this TLB.
+		 */
+		if (cpu_first_thread_sibling(boot_cpuid) == first)
+			first = boot_cpuid;
+
+		paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;
+
+		/*
+		 * If we have threads, we need either tlbsrx.
+		 * or e6500 tablewalk mode, or else TLB handlers
+		 * will be racy and could produce duplicate entries.
+		 * Should we panic instead?
+		 */
+		WARN_ONCE(smt_enabled_at_boot >= 2 &&
+			  !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
+			  book3e_htw_mode != PPC_HTW_E6500,
+			  "%s: unsupported MMU configuration\n", __func__);
+	}
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+static char *smt_enabled_cmdline;
+
+/* Look for ibm,smt-enabled OF option */
+void __init check_smt_enabled(void)
+{
+	struct device_node *dn;
+	const char *smt_option;
+
+	/* Default to enabling all threads */
+	smt_enabled_at_boot = threads_per_core;
+
+	/* Allow the command line to overrule the OF option */
+	if (smt_enabled_cmdline) {
+		if (!strcmp(smt_enabled_cmdline, "on"))
+			smt_enabled_at_boot = threads_per_core;
+		else if (!strcmp(smt_enabled_cmdline, "off"))
+			smt_enabled_at_boot = 0;
+		else {
+			int smt;
+			int rc;
+
+			rc = kstrtoint(smt_enabled_cmdline, 10, &smt);
+			if (!rc)
+				smt_enabled_at_boot =
+					min(threads_per_core, smt);
+		}
+	} else {
+		dn = of_find_node_by_path("/options");
+		if (dn) {
+			smt_option = of_get_property(dn, "ibm,smt-enabled",
+						     NULL);
+
+			if (smt_option) {
+				if (!strcmp(smt_option, "on"))
+					smt_enabled_at_boot = threads_per_core;
+				else if (!strcmp(smt_option, "off"))
+					smt_enabled_at_boot = 0;
+			}
+
+			of_node_put(dn);
+		}
+	}
+}
+
+/* Look for smt-enabled= cmdline option */
+static int __init early_smt_enabled(char *p)
+{
+	smt_enabled_cmdline = p;
+	return 0;
+}
+early_param("smt-enabled", early_smt_enabled);
+
+#endif /* CONFIG_SMP */
+
+/** Fix up paca fields required for the boot cpu */
+static void __init fixup_boot_paca(void)
+{
+	/* The boot cpu is started */
+	get_paca()->cpu_start = 1;
+	/* Allow percpu accesses to work until we setup percpu data */
+	get_paca()->data_offset = 0;
+	/* Mark interrupts disabled in PACA */
+	irq_soft_mask_set(IRQS_DISABLED);
+}
+
+static void __init configure_exceptions(void)
+{
+	/*
+	 * Setup the trampolines from the lowmem exception vectors
+	 * to the kdump kernel when not using a relocatable kernel.
+	 */
+	setup_kdump_trampoline();
+
+	/* Under a PAPR hypervisor, we need hypercalls */
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		/* Enable AIL if possible */
+		pseries_enable_reloc_on_exc();
+
+		/*
+		 * Tell the hypervisor that we want our exceptions to
+		 * be taken in little endian mode.
+		 *
+		 * We don't call this for big endian as our calling convention
+		 * makes us always enter in BE, and the call may fail under
+		 * some circumstances with kdump.
+		 */
+#ifdef __LITTLE_ENDIAN__
+		pseries_little_endian_exceptions();
+#endif
+	} else {
+		/* Set endian mode using OPAL */
+		if (firmware_has_feature(FW_FEATURE_OPAL))
+			opal_configure_cores();
+
+		/* AIL on native is done in cpu_ready_for_interrupts() */
+	}
+}
+
+static void cpu_ready_for_interrupts(void)
+{
+	/*
+	 * Enable AIL if supported, and we are in hypervisor mode. This
+	 * is called once for every processor.
+	 *
+	 * If we are not in hypervisor mode the job is done once for
+	 * the whole partition in configure_exceptions().
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		unsigned long lpcr = mfspr(SPRN_LPCR);
+		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
+	}
+
+	/*
+	 * Set HFSCR:TM based on CPU features:
+	 * In the special case of TM no suspend (P9N DD2.1), Linux is
+	 * told TM is off via the dt-ftrs but told to (partially) use
+	 * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM]
+	 * will be off from dt-ftrs but we need to turn it on for the
+	 * no suspend case.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cpu_has_feature(CPU_FTR_TM_COMP))
+			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM);
+		else
+			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
+	}
+
+	/* Set IR and DR in PACA MSR */
+	get_paca()->kernel_msr = MSR_KERNEL;
+}
+
+unsigned long spr_default_dscr = 0;
+
+void __init record_spr_defaults(void)
+{
+	if (early_cpu_has_feature(CPU_FTR_DSCR))
+		spr_default_dscr = mfspr(SPRN_DSCR);
+}
+
+/*
+ * Early initialization entry point. This is called by head.S
+ * with MMU translation disabled. We rely on the "feature" of
+ * the CPU that ignores the top 2 bits of the address in real
+ * mode so we can access kernel globals normally provided we
+ * only toy with things in the RMO region. From here, we do
+ * some early parsing of the device-tree to setup out MEMBLOCK
+ * data structures, and allocate & initialize the hash table
+ * and segment tables so we can start running with translation
+ * enabled.
+ *
+ * It is this function which will call the probe() callback of
+ * the various platform types and copy the matching one to the
+ * global ppc_md structure. Your platform can eventually do
+ * some very early initializations from the probe() routine, but
+ * this is not recommended, be very careful as, for example, the
+ * device-tree is not accessible via normal means at this point.
+ */
+
+void __init early_setup(unsigned long dt_ptr)
+{
+	static __initdata struct paca_struct boot_paca;
+
+	/* -------- printk is _NOT_ safe to use here ! ------- */
+
+	/* Try new device tree based feature discovery ... */
+	if (!dt_cpu_ftrs_init(__va(dt_ptr)))
+		/* Otherwise use the old style CPU table */
+		identify_cpu(0, mfspr(SPRN_PVR));
+
+	/* Assume we're on cpu 0 for now. Don't write to the paca yet! */
+	initialise_paca(&boot_paca, 0);
+	setup_paca(&boot_paca);
+	fixup_boot_paca();
+
+	/* -------- printk is now safe to use ------- */
+
+	/* Enable early debugging if any specified (see udbg.h) */
+	udbg_early_init();
+
+ 	DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr);
+
+	/*
+	 * Do early initialization using the flattened device
+	 * tree, such as retrieving the physical memory map or
+	 * calculating/retrieving the hash table size.
+	 */
+	early_init_devtree(__va(dt_ptr));
+
+	/* Now we know the logical id of our boot cpu, setup the paca. */
+	if (boot_cpuid != 0) {
+		/* Poison paca_ptrs[0] again if it's not the boot cpu */
+		memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
+	}
+	setup_paca(paca_ptrs[boot_cpuid]);
+	fixup_boot_paca();
+
+	/*
+	 * Configure exception handlers. This include setting up trampolines
+	 * if needed, setting exception endian mode, etc...
+	 */
+	configure_exceptions();
+
+	/* Apply all the dynamic patching */
+	apply_feature_fixups();
+	setup_feature_keys();
+
+	/* Initialize the hash table or TLB handling */
+	early_init_mmu();
+
+	/*
+	 * After firmware and early platform setup code has set things up,
+	 * we note the SPR values for configurable control/performance
+	 * registers, and use those as initial defaults.
+	 */
+	record_spr_defaults();
+
+	/*
+	 * At this point, we can let interrupts switch to virtual mode
+	 * (the MMU has been setup), so adjust the MSR in the PACA to
+	 * have IR and DR set and enable AIL if it exists
+	 */
+	cpu_ready_for_interrupts();
+
+	/*
+	 * We enable ftrace here, but since we only support DYNAMIC_FTRACE, it
+	 * will only actually get enabled on the boot cpu much later once
+	 * ftrace itself has been initialized.
+	 */
+	this_cpu_enable_ftrace();
+
+	DBG(" <- early_setup()\n");
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+	/*
+	 * This needs to be done *last* (after the above DBG() even)
+	 *
+	 * Right after we return from this function, we turn on the MMU
+	 * which means the real-mode access trick that btext does will
+	 * no longer work, it needs to switch to using a real MMU
+	 * mapping. This call will ensure that it does
+	 */
+	btext_map();
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
+}
+
+#ifdef CONFIG_SMP
+void early_setup_secondary(void)
+{
+	/* Mark interrupts disabled in PACA */
+	irq_soft_mask_set(IRQS_DISABLED);
+
+	/* Initialize the hash table or TLB handling */
+	early_init_mmu_secondary();
+
+	/*
+	 * At this point, we can let interrupts switch to virtual mode
+	 * (the MMU has been setup), so adjust the MSR in the PACA to
+	 * have IR and DR set.
+	 */
+	cpu_ready_for_interrupts();
+}
+
+#endif /* CONFIG_SMP */
+
+void panic_smp_self_stop(void)
+{
+	hard_irq_disable();
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
+#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
+static bool use_spinloop(void)
+{
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+		/*
+		 * See comments in head_64.S -- not all platforms insert
+		 * secondaries at __secondary_hold and wait at the spin
+		 * loop.
+		 */
+		if (firmware_has_feature(FW_FEATURE_OPAL))
+			return false;
+		return true;
+	}
+
+	/*
+	 * When book3e boots from kexec, the ePAPR spin table does
+	 * not get used.
+	 */
+	return of_property_read_bool(of_chosen, "linux,booted-from-kexec");
+}
+
+void smp_release_cpus(void)
+{
+	unsigned long *ptr;
+	int i;
+
+	if (!use_spinloop())
+		return;
+
+	DBG(" -> smp_release_cpus()\n");
+
+	/* All secondary cpus are spinning on a common spinloop, release them
+	 * all now so they can start to spin on their individual paca
+	 * spinloops. For non SMP kernels, the secondary cpus never get out
+	 * of the common spinloop.
+	 */
+
+	ptr  = (unsigned long *)((unsigned long)&__secondary_hold_spinloop
+			- PHYSICAL_START);
+	*ptr = ppc_function_entry(generic_secondary_smp_init);
+
+	/* And wait a bit for them to catch up */
+	for (i = 0; i < 100000; i++) {
+		mb();
+		HMT_low();
+		if (spinning_secondaries == 0)
+			break;
+		udelay(1);
+	}
+	DBG("spinning_secondaries = %d\n", spinning_secondaries);
+
+	DBG(" <- smp_release_cpus()\n");
+}
+#endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */
+
+/*
+ * Initialize some remaining members of the ppc64_caches and systemcfg
+ * structures
+ * (at least until we get rid of them completely). This is mostly some
+ * cache informations about the CPU that will be used by cache flush
+ * routines and/or provided to userland
+ */
+
+static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
+			    u32 bsize, u32 sets)
+{
+	info->size = size;
+	info->sets = sets;
+	info->line_size = lsize;
+	info->block_size = bsize;
+	info->log_block_size = __ilog2(bsize);
+	if (bsize)
+		info->blocks_per_page = PAGE_SIZE / bsize;
+	else
+		info->blocks_per_page = 0;
+
+	if (sets == 0)
+		info->assoc = 0xffff;
+	else
+		info->assoc = size / (sets * lsize);
+}
+
+static bool __init parse_cache_info(struct device_node *np,
+				    bool icache,
+				    struct ppc_cache_info *info)
+{
+	static const char *ipropnames[] __initdata = {
+		"i-cache-size",
+		"i-cache-sets",
+		"i-cache-block-size",
+		"i-cache-line-size",
+	};
+	static const char *dpropnames[] __initdata = {
+		"d-cache-size",
+		"d-cache-sets",
+		"d-cache-block-size",
+		"d-cache-line-size",
+	};
+	const char **propnames = icache ? ipropnames : dpropnames;
+	const __be32 *sizep, *lsizep, *bsizep, *setsp;
+	u32 size, lsize, bsize, sets;
+	bool success = true;
+
+	size = 0;
+	sets = -1u;
+	lsize = bsize = cur_cpu_spec->dcache_bsize;
+	sizep = of_get_property(np, propnames[0], NULL);
+	if (sizep != NULL)
+		size = be32_to_cpu(*sizep);
+	setsp = of_get_property(np, propnames[1], NULL);
+	if (setsp != NULL)
+		sets = be32_to_cpu(*setsp);
+	bsizep = of_get_property(np, propnames[2], NULL);
+	lsizep = of_get_property(np, propnames[3], NULL);
+	if (bsizep == NULL)
+		bsizep = lsizep;
+	if (lsizep == NULL)
+		lsizep = bsizep;
+	if (lsizep != NULL)
+		lsize = be32_to_cpu(*lsizep);
+	if (bsizep != NULL)
+		bsize = be32_to_cpu(*bsizep);
+	if (sizep == NULL || bsizep == NULL || lsizep == NULL)
+		success = false;
+
+	/*
+	 * OF is weird .. it represents fully associative caches
+	 * as "1 way" which doesn't make much sense and doesn't
+	 * leave room for direct mapped. We'll assume that 0
+	 * in OF means direct mapped for that reason.
+	 */
+	if (sets == 1)
+		sets = 0;
+	else if (sets == 0)
+		sets = 1;
+
+	init_cache_info(info, size, lsize, bsize, sets);
+
+	return success;
+}
+
+void __init initialize_cache_info(void)
+{
+	struct device_node *cpu = NULL, *l2, *l3 = NULL;
+	u32 pvr;
+
+	DBG(" -> initialize_cache_info()\n");
+
+	/*
+	 * All shipping POWER8 machines have a firmware bug that
+	 * puts incorrect information in the device-tree. This will
+	 * be (hopefully) fixed for future chips but for now hard
+	 * code the values if we are running on one of these
+	 */
+	pvr = PVR_VER(mfspr(SPRN_PVR));
+	if (pvr == PVR_POWER8 || pvr == PVR_POWER8E ||
+	    pvr == PVR_POWER8NVL) {
+						/* size    lsize   blk  sets */
+		init_cache_info(&ppc64_caches.l1i, 0x8000,   128,  128, 32);
+		init_cache_info(&ppc64_caches.l1d, 0x10000,  128,  128, 64);
+		init_cache_info(&ppc64_caches.l2,  0x80000,  128,  0,   512);
+		init_cache_info(&ppc64_caches.l3,  0x800000, 128,  0,   8192);
+	} else
+		cpu = of_find_node_by_type(NULL, "cpu");
+
+	/*
+	 * We're assuming *all* of the CPUs have the same
+	 * d-cache and i-cache sizes... -Peter
+	 */
+	if (cpu) {
+		if (!parse_cache_info(cpu, false, &ppc64_caches.l1d))
+			DBG("Argh, can't find dcache properties !\n");
+
+		if (!parse_cache_info(cpu, true, &ppc64_caches.l1i))
+			DBG("Argh, can't find icache properties !\n");
+
+		/*
+		 * Try to find the L2 and L3 if any. Assume they are
+		 * unified and use the D-side properties.
+		 */
+		l2 = of_find_next_cache_node(cpu);
+		of_node_put(cpu);
+		if (l2) {
+			parse_cache_info(l2, false, &ppc64_caches.l2);
+			l3 = of_find_next_cache_node(l2);
+			of_node_put(l2);
+		}
+		if (l3) {
+			parse_cache_info(l3, false, &ppc64_caches.l3);
+			of_node_put(l3);
+		}
+	}
+
+	/* For use by binfmt_elf */
+	dcache_bsize = ppc64_caches.l1d.block_size;
+	icache_bsize = ppc64_caches.l1i.block_size;
+
+	cur_cpu_spec->dcache_bsize = dcache_bsize;
+	cur_cpu_spec->icache_bsize = icache_bsize;
+
+	DBG(" <- initialize_cache_info()\n");
+}
+
+/*
+ * This returns the limit below which memory accesses to the linear
+ * mapping are guarnateed not to cause an architectural exception (e.g.,
+ * TLB or SLB miss fault).
+ *
+ * This is used to allocate PACAs and various interrupt stacks that
+ * that are accessed early in interrupt handlers that must not cause
+ * re-entrant interrupts.
+ */
+__init u64 ppc64_bolted_size(void)
+{
+#ifdef CONFIG_PPC_BOOK3E
+	/* Freescale BookE bolts the entire linear mapping */
+	/* XXX: BookE ppc64_rma_limit setup seems to disagree? */
+	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+		return linear_map_top;
+	/* Other BookE, we assume the first GB is bolted */
+	return 1ul << 30;
+#else
+	/* BookS radix, does not take faults on linear mapping */
+	if (early_radix_enabled())
+		return ULONG_MAX;
+
+	/* BookS hash, the first segment is bolted */
+	if (early_mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		return 1UL << SID_SHIFT_1T;
+	return 1UL << SID_SHIFT;
+#endif
+}
+
+static void *__init alloc_stack(unsigned long limit, int cpu)
+{
+	unsigned long pa;
+
+	pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
+					early_cpu_to_node(cpu), MEMBLOCK_NONE);
+	if (!pa) {
+		pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
+		if (!pa)
+			panic("cannot allocate stacks");
+	}
+
+	return __va(pa);
+}
+
+void __init irqstack_early_init(void)
+{
+	u64 limit = ppc64_bolted_size();
+	unsigned int i;
+
+	/*
+	 * Interrupt stacks must be in the first segment since we
+	 * cannot afford to take SLB misses on them. They are not
+	 * accessed in realmode.
+	 */
+	for_each_possible_cpu(i) {
+		softirq_ctx[i] = alloc_stack(limit, i);
+		hardirq_ctx[i] = alloc_stack(limit, i);
+	}
+}
+
+#ifdef CONFIG_PPC_BOOK3E
+void __init exc_lvl_early_init(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i) {
+		void *sp;
+
+		sp = alloc_stack(ULONG_MAX, i);
+		critirq_ctx[i] = sp;
+		paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;
+
+		sp = alloc_stack(ULONG_MAX, i);
+		dbgirq_ctx[i] = sp;
+		paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;
+
+		sp = alloc_stack(ULONG_MAX, i);
+		mcheckirq_ctx[i] = sp;
+		paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
+	}
+
+	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
+		patch_exception(0x040, exc_debug_debug_book3e);
+}
+#endif
+
+/*
+ * Emergency stacks are used for a range of things, from asynchronous
+ * NMIs (system reset, machine check) to synchronous, process context.
+ * We set preempt_count to zero, even though that isn't necessarily correct. To
+ * get the right value we'd need to copy it from the previous thread_info, but
+ * doing that might fault causing more problems.
+ * TODO: what to do with accounting?
+ */
+static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu)
+{
+	ti->task = NULL;
+	ti->cpu = cpu;
+	ti->preempt_count = 0;
+	ti->local_flags = 0;
+	ti->flags = 0;
+	klp_init_thread_info(ti);
+}
+
+/*
+ * Stack space used when we detect a bad kernel stack pointer, and
+ * early in SMP boots before relocation is enabled. Exclusive emergency
+ * stack for machine checks.
+ */
+void __init emergency_stack_init(void)
+{
+	u64 limit;
+	unsigned int i;
+
+	/*
+	 * Emergency stacks must be under 256MB, we cannot afford to take
+	 * SLB misses on them. The ABI also requires them to be 128-byte
+	 * aligned.
+	 *
+	 * Since we use these as temporary stacks during secondary CPU
+	 * bringup, machine check, system reset, and HMI, we need to get
+	 * at them in real mode. This means they must also be within the RMO
+	 * region.
+	 *
+	 * The IRQ stacks allocated elsewhere in this file are zeroed and
+	 * initialized in kernel/irq.c. These are initialized here in order
+	 * to have emergency stacks available as early as possible.
+	 */
+	limit = min(ppc64_bolted_size(), ppc64_rma_size);
+
+	for_each_possible_cpu(i) {
+		struct thread_info *ti;
+
+		ti = alloc_stack(limit, i);
+		memset(ti, 0, THREAD_SIZE);
+		emerg_stack_init_thread_info(ti, i);
+		paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for NMI exception handling. */
+		ti = alloc_stack(limit, i);
+		memset(ti, 0, THREAD_SIZE);
+		emerg_stack_init_thread_info(ti, i);
+		paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+
+		/* emergency stack for machine check exception handling. */
+		ti = alloc_stack(limit, i);
+		memset(ti, 0, THREAD_SIZE);
+		emerg_stack_init_thread_info(ti, i);
+		paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
+#endif
+	}
+}
+
+#ifdef CONFIG_SMP
+#define PCPU_DYN_SIZE		()
+
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
+{
+	return __alloc_bootmem_node(NODE_DATA(early_cpu_to_node(cpu)), size, align,
+				    __pa(MAX_DMA_ADDRESS));
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+	free_bootmem(__pa(ptr), size);
+}
+
+static int pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+	if (early_cpu_to_node(from) == early_cpu_to_node(to))
+		return LOCAL_DISTANCE;
+	else
+		return REMOTE_DISTANCE;
+}
+
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+	size_t atom_size;
+	unsigned long delta;
+	unsigned int cpu;
+	int rc;
+
+	/*
+	 * Linear mapping is one of 4K, 1M and 16M.  For 4K, no need
+	 * to group units.  For larger mappings, use 1M atom which
+	 * should be large enough to contain a number of units.
+	 */
+	if (mmu_linear_psize == MMU_PAGE_4K)
+		atom_size = PAGE_SIZE;
+	else
+		atom_size = 1 << 20;
+
+	rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
+				    pcpu_fc_alloc, pcpu_fc_free);
+	if (rc < 0)
+		panic("cannot initialize percpu area (err=%d)", rc);
+
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+                __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
+		paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
+	}
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+unsigned long memory_block_size_bytes(void)
+{
+	if (ppc_md.memory_block_size)
+		return ppc_md.memory_block_size();
+
+	return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
+#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
+struct ppc_pci_io ppc_pci_io;
+EXPORT_SYMBOL(ppc_pci_io);
+#endif
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+	return ppc_proc_freq * watchdog_thresh;
+}
+#endif
+
+/*
+ * The perf based hardlockup detector breaks PMU event based branches, so
+ * disable it by default. Book3S has a soft-nmi hardlockup detector based
+ * on the decrementer interrupt, so it does not suffer from this problem.
+ *
+ * It is likely to get false positives in VM guests, so disable it there
+ * by default too.
+ */
+static int __init disable_hardlockup_detector(void)
+{
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+	hardlockup_detector_disable();
+#else
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		hardlockup_detector_disable();
+#endif
+
+	return 0;
+}
+early_initcall(disable_hardlockup_detector);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static enum l1d_flush_type enabled_flush_types;
+static void *l1d_flush_fallback_area;
+static bool no_rfi_flush;
+static bool no_entry_flush;
+static bool no_uaccess_flush;
+bool rfi_flush;
+bool entry_flush;
+bool uaccess_flush;
+DEFINE_STATIC_KEY_FALSE(uaccess_flush_key);
+EXPORT_SYMBOL(uaccess_flush_key);
+
+static int __init handle_no_rfi_flush(char *p)
+{
+	pr_info("rfi-flush: disabled on command line.");
+	no_rfi_flush = true;
+	return 0;
+}
+early_param("no_rfi_flush", handle_no_rfi_flush);
+
+static int __init handle_no_entry_flush(char *p)
+{
+	pr_info("entry-flush: disabled on command line.");
+	no_entry_flush = true;
+	return 0;
+}
+early_param("no_entry_flush", handle_no_entry_flush);
+
+static int __init handle_no_uaccess_flush(char *p)
+{
+	pr_info("uaccess-flush: disabled on command line.");
+	no_uaccess_flush = true;
+	return 0;
+}
+early_param("no_uaccess_flush", handle_no_uaccess_flush);
+
+/*
+ * The RFI flush is not KPTI, but because users will see doco that says to use
+ * nopti we hijack that option here to also disable the RFI flush.
+ */
+static int __init handle_no_pti(char *p)
+{
+	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");
+	handle_no_rfi_flush(NULL);
+	return 0;
+}
+early_param("nopti", handle_no_pti);
+
+static void do_nothing(void *unused)
+{
+	/*
+	 * We don't need to do the flush explicitly, just enter+exit kernel is
+	 * sufficient, the RFI exit handlers will do the right thing.
+	 */
+}
+
+void rfi_flush_enable(bool enable)
+{
+	if (enable) {
+		do_rfi_flush_fixups(enabled_flush_types);
+		on_each_cpu(do_nothing, NULL, 1);
+	} else
+		do_rfi_flush_fixups(L1D_FLUSH_NONE);
+
+	rfi_flush = enable;
+}
+
+void entry_flush_enable(bool enable)
+{
+	if (enable) {
+		do_entry_flush_fixups(enabled_flush_types);
+		on_each_cpu(do_nothing, NULL, 1);
+	} else {
+		do_entry_flush_fixups(L1D_FLUSH_NONE);
+	}
+
+	entry_flush = enable;
+}
+
+void uaccess_flush_enable(bool enable)
+{
+	if (enable) {
+		do_uaccess_flush_fixups(enabled_flush_types);
+		static_branch_enable(&uaccess_flush_key);
+		on_each_cpu(do_nothing, NULL, 1);
+	} else {
+		static_branch_disable(&uaccess_flush_key);
+		do_uaccess_flush_fixups(L1D_FLUSH_NONE);
+	}
+
+	uaccess_flush = enable;
+}
+
+static void __ref init_fallback_flush(void)
+{
+	u64 l1d_size, limit;
+	int cpu;
+
+	/* Only allocate the fallback flush area once (at boot time). */
+	if (l1d_flush_fallback_area)
+		return;
+
+	l1d_size = ppc64_caches.l1d.size;
+
+	/*
+	 * If there is no d-cache-size property in the device tree, l1d_size
+	 * could be zero. That leads to the loop in the asm wrapping around to
+	 * 2^64-1, and then walking off the end of the fallback area and
+	 * eventually causing a page fault which is fatal. Just default to
+	 * something vaguely sane.
+	 */
+	if (!l1d_size)
+		l1d_size = (64 * 1024);
+
+	limit = min(ppc64_bolted_size(), ppc64_rma_size);
+
+	/*
+	 * Align to L1d size, and size it at 2x L1d size, to catch possible
+	 * hardware prefetch runoff. We don't have a recipe for load patterns to
+	 * reliably avoid the prefetcher.
+	 */
+	l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
+	memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+
+	for_each_possible_cpu(cpu) {
+		struct paca_struct *paca = paca_ptrs[cpu];
+		paca->rfi_flush_fallback_area = l1d_flush_fallback_area;
+		paca->l1d_flush_size = l1d_size;
+	}
+}
+
+void setup_rfi_flush(enum l1d_flush_type types, bool enable)
+{
+	if (types & L1D_FLUSH_FALLBACK) {
+		pr_info("rfi-flush: fallback displacement flush available\n");
+		init_fallback_flush();
+	}
+
+	if (types & L1D_FLUSH_ORI)
+		pr_info("rfi-flush: ori type flush available\n");
+
+	if (types & L1D_FLUSH_MTTRIG)
+		pr_info("rfi-flush: mttrig type flush available\n");
+
+	enabled_flush_types = types;
+
+	if (!cpu_mitigations_off() && !no_rfi_flush)
+		rfi_flush_enable(enable);
+}
+
+void setup_entry_flush(bool enable)
+{
+	if (cpu_mitigations_off())
+		return;
+
+	if (!no_entry_flush)
+		entry_flush_enable(enable);
+}
+
+void setup_uaccess_flush(bool enable)
+{
+	if (cpu_mitigations_off())
+		return;
+
+	if (!no_uaccess_flush)
+		uaccess_flush_enable(enable);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int rfi_flush_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	/* Only do anything if we're changing state */
+	if (enable != rfi_flush)
+		rfi_flush_enable(enable);
+
+	return 0;
+}
+
+static int rfi_flush_get(void *data, u64 *val)
+{
+	*val = rfi_flush ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
+
+static int entry_flush_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	/* Only do anything if we're changing state */
+	if (enable != entry_flush)
+		entry_flush_enable(enable);
+
+	return 0;
+}
+
+static int entry_flush_get(void *data, u64 *val)
+{
+	*val = entry_flush ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_entry_flush, entry_flush_get, entry_flush_set, "%llu\n");
+
+static int uaccess_flush_set(void *data, u64 val)
+{
+	bool enable;
+
+	if (val == 1)
+		enable = true;
+	else if (val == 0)
+		enable = false;
+	else
+		return -EINVAL;
+
+	/* Only do anything if we're changing state */
+	if (enable != uaccess_flush)
+		uaccess_flush_enable(enable);
+
+	return 0;
+}
+
+static int uaccess_flush_get(void *data, u64 *val)
+{
+	*val = uaccess_flush ? 1 : 0;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set, "%llu\n");
+
+static __init int rfi_flush_debugfs_init(void)
+{
+	debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
+	debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush);
+	debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush);
+	return 0;
+}
+device_initcall(rfi_flush_debugfs_init);
+#endif
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
new file mode 100644
index 000000000..57b3745f7
--- /dev/null
+++ b/arch/powerpc/kernel/signal.c
@@ -0,0 +1,226 @@
+/*
+ * Common signal handling code for both 32 and 64 bits
+ *
+ *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation
+ *    Extracted from signal_32.c and signal_64.c
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file README.legal in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/tracehook.h>
+#include <linux/signal.h>
+#include <linux/uprobes.h>
+#include <linux/key.h>
+#include <linux/context_tracking.h>
+#include <linux/livepatch.h>
+#include <linux/syscalls.h>
+#include <asm/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/debug.h>
+#include <asm/tm.h>
+
+#include "signal.h"
+
+/* Log an error when sending an unhandled signal to a process. Controlled
+ * through debug.exception-trace sysctl.
+ */
+
+int show_unhandled_signals = 1;
+
+/*
+ * Allocate space for the signal frame
+ */
+void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
+			   size_t frame_size, int is_32)
+{
+        unsigned long oldsp, newsp;
+
+        /* Default to using normal stack */
+        oldsp = get_clean_sp(sp, is_32);
+	oldsp = sigsp(oldsp, ksig);
+	newsp = (oldsp - frame_size) & ~0xFUL;
+
+	/* Check access */
+	if (!access_ok(VERIFY_WRITE, (void __user *)newsp, oldsp - newsp))
+		return NULL;
+
+        return (void __user *)newsp;
+}
+
+static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
+				  int has_handler)
+{
+	unsigned long ret = regs->gpr[3];
+	int restart = 1;
+
+	/* syscall ? */
+	if (TRAP(regs) != 0x0C00)
+		return;
+
+	/* error signalled ? */
+	if (!(regs->ccr & 0x10000000))
+		return;
+
+	switch (ret) {
+	case ERESTART_RESTARTBLOCK:
+	case ERESTARTNOHAND:
+		/* ERESTARTNOHAND means that the syscall should only be
+		 * restarted if there was no handler for the signal, and since
+		 * we only get here if there is a handler, we dont restart.
+		 */
+		restart = !has_handler;
+		break;
+	case ERESTARTSYS:
+		/* ERESTARTSYS means to restart the syscall if there is no
+		 * handler or the handler was registered with SA_RESTART
+		 */
+		restart = !has_handler || (ka->sa.sa_flags & SA_RESTART) != 0;
+		break;
+	case ERESTARTNOINTR:
+		/* ERESTARTNOINTR means that the syscall should be
+		 * called again after the signal handler returns.
+		 */
+		break;
+	default:
+		return;
+	}
+	if (restart) {
+		if (ret == ERESTART_RESTARTBLOCK)
+			regs->gpr[0] = __NR_restart_syscall;
+		else
+			regs->gpr[3] = regs->orig_gpr3;
+		regs->nip -= 4;
+		regs->result = 0;
+	} else {
+		regs->result = -EINTR;
+		regs->gpr[3] = EINTR;
+		regs->ccr |= 0x10000000;
+	}
+}
+
+static void do_signal(struct task_struct *tsk)
+{
+	sigset_t *oldset = sigmask_to_save();
+	struct ksignal ksig = { .sig = 0 };
+	int ret;
+	int is32 = is_32bit_task();
+
+	BUG_ON(tsk != current);
+
+	get_signal(&ksig);
+
+	/* Is there any syscall restart business here ? */
+	check_syscall_restart(tsk->thread.regs, &ksig.ka, ksig.sig > 0);
+
+	if (ksig.sig <= 0) {
+		/* No signal to deliver -- put the saved sigmask back */
+		restore_saved_sigmask();
+		tsk->thread.regs->trap = 0;
+		return;               /* no signals delivered */
+	}
+
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+        /*
+	 * Reenable the DABR before delivering the signal to
+	 * user space. The DABR will have been cleared if it
+	 * triggered inside the kernel.
+	 */
+	if (tsk->thread.hw_brk.address && tsk->thread.hw_brk.type)
+		__set_breakpoint(&tsk->thread.hw_brk);
+#endif
+	/* Re-enable the breakpoints for the signal stack */
+	thread_change_pc(tsk, tsk->thread.regs);
+
+	rseq_signal_deliver(&ksig, tsk->thread.regs);
+
+	if (is32) {
+        	if (ksig.ka.sa.sa_flags & SA_SIGINFO)
+			ret = handle_rt_signal32(&ksig, oldset, tsk);
+		else
+			ret = handle_signal32(&ksig, oldset, tsk);
+	} else {
+		ret = handle_rt_signal64(&ksig, oldset, tsk);
+	}
+
+	tsk->thread.regs->trap = 0;
+	signal_setup_done(ret, &ksig, test_thread_flag(TIF_SINGLESTEP));
+}
+
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
+{
+	user_exit();
+
+	/* Check valid addr_limit, TIF check is done there */
+	addr_limit_user_check();
+
+	if (thread_info_flags & _TIF_UPROBE)
+		uprobe_notify_resume(regs);
+
+	if (thread_info_flags & _TIF_PATCH_PENDING)
+		klp_update_patch_state(current);
+
+	if (thread_info_flags & _TIF_SIGPENDING) {
+		BUG_ON(regs != current->thread.regs);
+		do_signal(current);
+	}
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+		rseq_handle_notify_resume(NULL, regs);
+	}
+
+	user_enter();
+}
+
+unsigned long get_tm_stackpointer(struct task_struct *tsk)
+{
+	/* When in an active transaction that takes a signal, we need to be
+	 * careful with the stack.  It's possible that the stack has moved back
+	 * up after the tbegin.  The obvious case here is when the tbegin is
+	 * called inside a function that returns before a tend.  In this case,
+	 * the stack is part of the checkpointed transactional memory state.
+	 * If we write over this non transactionally or in suspend, we are in
+	 * trouble because if we get a tm abort, the program counter and stack
+	 * pointer will be back at the tbegin but our in memory stack won't be
+	 * valid anymore.
+	 *
+	 * To avoid this, when taking a signal in an active transaction, we
+	 * need to use the stack pointer from the checkpointed state, rather
+	 * than the speculated state.  This ensures that the signal context
+	 * (written tm suspended) will be written below the stack required for
+	 * the rollback.  The transaction is aborted because of the treclaim,
+	 * so any memory written between the tbegin and the signal will be
+	 * rolled back anyway.
+	 *
+	 * For signals taken in non-TM or suspended mode, we use the
+	 * normal/non-checkpointed stack pointer.
+	 */
+
+	unsigned long ret = tsk->thread.regs->gpr[1];
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	BUG_ON(tsk != current);
+
+	if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
+		preempt_disable();
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
+		if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
+			ret = tsk->thread.ckpt_regs.gpr[1];
+
+		/*
+		 * If we treclaim, we must clear the current thread's TM bits
+		 * before re-enabling preemption. Otherwise we might be
+		 * preempted and have the live MSR[TS] changed behind our back
+		 * (tm_recheckpoint_new_task() would recheckpoint). Besides, we
+		 * enter the signal handler in non-transactional state.
+		 */
+		tsk->thread.regs->msr &= ~MSR_TS_MASK;
+		preempt_enable();
+	}
+#endif
+	return ret;
+}
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
new file mode 100644
index 000000000..800433685
--- /dev/null
+++ b/arch/powerpc/kernel/signal.h
@@ -0,0 +1,63 @@
+/*
+ *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation
+ *    Extracted from signal_32.c and signal_64.c
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file README.legal in the main directory of
+ * this archive for more details.
+ */
+
+#ifndef _POWERPC_ARCH_SIGNAL_H
+#define _POWERPC_ARCH_SIGNAL_H
+
+extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+
+extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
+				  size_t frame_size, int is_32);
+
+extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
+			   struct task_struct *tsk);
+
+extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
+			      struct task_struct *tsk);
+
+extern unsigned long copy_fpr_to_user(void __user *to,
+				      struct task_struct *task);
+extern unsigned long copy_ckfpr_to_user(void __user *to,
+					       struct task_struct *task);
+extern unsigned long copy_fpr_from_user(struct task_struct *task,
+					void __user *from);
+extern unsigned long copy_ckfpr_from_user(struct task_struct *task,
+						 void __user *from);
+extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
+
+#ifdef CONFIG_VSX
+extern unsigned long copy_vsx_to_user(void __user *to,
+				      struct task_struct *task);
+extern unsigned long copy_ckvsx_to_user(void __user *to,
+					       struct task_struct *task);
+extern unsigned long copy_vsx_from_user(struct task_struct *task,
+					void __user *from);
+extern unsigned long copy_ckvsx_from_user(struct task_struct *task,
+						 void __user *from);
+#endif
+
+#ifdef CONFIG_PPC64
+
+extern int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
+			      struct task_struct *tsk);
+
+#else /* CONFIG_PPC64 */
+
+extern long sys_rt_sigreturn(void);
+extern long sys_sigreturn(void);
+
+static inline int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
+				     struct task_struct *tsk)
+{
+	return -EFAULT;
+}
+
+#endif /* !defined(CONFIG_PPC64) */
+
+#endif  /* _POWERPC_ARCH_SIGNAL_H */
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
new file mode 100644
index 000000000..06b4b828d
--- /dev/null
+++ b/arch/powerpc/kernel/signal_32.c
@@ -0,0 +1,1516 @@
+/*
+ * Signal handling for 32bit PPC and 32bit tasks on 64bit PPC
+ *
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ * Copyright (C) 2001 IBM
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ *
+ *  Derived from "arch/i386/kernel/signal.c"
+ *    Copyright (C) 1991, 1992 Linus Torvalds
+ *    1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/elf.h>
+#include <linux/ptrace.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/syscalls.h>
+#ifdef CONFIG_PPC64
+#include <linux/compat.h>
+#else
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#endif
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/syscalls.h>
+#include <asm/sigcontext.h>
+#include <asm/vdso.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/asm-prototypes.h>
+#ifdef CONFIG_PPC64
+#include "ppc32.h"
+#include <asm/unistd.h>
+#else
+#include <asm/ucontext.h>
+#include <asm/pgtable.h>
+#endif
+
+#include "signal.h"
+
+
+#ifdef CONFIG_PPC64
+#define old_sigaction	old_sigaction32
+#define sigcontext	sigcontext32
+#define mcontext	mcontext32
+#define ucontext	ucontext32
+
+#define __save_altstack __compat_save_altstack
+
+/*
+ * Userspace code may pass a ucontext which doesn't include VSX added
+ * at the end.  We need to check for this case.
+ */
+#define UCONTEXTSIZEWITHOUTVSX \
+		(sizeof(struct ucontext) - sizeof(elf_vsrreghalf_t32))
+
+/*
+ * Returning 0 means we return to userspace via
+ * ret_from_except and thus restore all user
+ * registers from *regs.  This is what we need
+ * to do when a signal has been delivered.
+ */
+
+#define GP_REGS_SIZE	min(sizeof(elf_gregset_t32), sizeof(struct pt_regs32))
+#undef __SIGNAL_FRAMESIZE
+#define __SIGNAL_FRAMESIZE	__SIGNAL_FRAMESIZE32
+#undef ELF_NVRREG
+#define ELF_NVRREG	ELF_NVRREG32
+
+/*
+ * Functions for flipping sigsets (thanks to brain dead generic
+ * implementation that makes things simple for little endian only)
+ */
+static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)
+{
+	return put_compat_sigset(uset, set, sizeof(*uset));
+}
+
+static inline int get_sigset_t(sigset_t *set,
+			       const compat_sigset_t __user *uset)
+{
+	return get_compat_sigset(set, uset);
+}
+
+#define to_user_ptr(p)		ptr_to_compat(p)
+#define from_user_ptr(p)	compat_ptr(p)
+
+static inline int save_general_regs(struct pt_regs *regs,
+		struct mcontext __user *frame)
+{
+	elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
+	int i;
+	/* Force usr to alway see softe as 1 (interrupts enabled) */
+	elf_greg_t64 softe = 0x1;
+
+	WARN_ON(!FULL_REGS(regs));
+
+	for (i = 0; i <= PT_RESULT; i ++) {
+		if (i == 14 && !FULL_REGS(regs))
+			i = 32;
+		if ( i == PT_SOFTE) {
+			if(__put_user((unsigned int)softe, &frame->mc_gregs[i]))
+				return -EFAULT;
+			else
+				continue;
+		}
+		if (__put_user((unsigned int)gregs[i], &frame->mc_gregs[i]))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static inline int restore_general_regs(struct pt_regs *regs,
+		struct mcontext __user *sr)
+{
+	elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
+	int i;
+
+	for (i = 0; i <= PT_RESULT; i++) {
+		if ((i == PT_MSR) || (i == PT_SOFTE))
+			continue;
+		if (__get_user(gregs[i], &sr->mc_gregs[i]))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+#else /* CONFIG_PPC64 */
+
+#define GP_REGS_SIZE	min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
+
+static inline int put_sigset_t(sigset_t __user *uset, sigset_t *set)
+{
+	return copy_to_user(uset, set, sizeof(*uset));
+}
+
+static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset)
+{
+	return copy_from_user(set, uset, sizeof(*uset));
+}
+
+#define to_user_ptr(p)		((unsigned long)(p))
+#define from_user_ptr(p)	((void __user *)(p))
+
+static inline int save_general_regs(struct pt_regs *regs,
+		struct mcontext __user *frame)
+{
+	WARN_ON(!FULL_REGS(regs));
+	return __copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE);
+}
+
+static inline int restore_general_regs(struct pt_regs *regs,
+		struct mcontext __user *sr)
+{
+	/* copy up to but not including MSR */
+	if (__copy_from_user(regs, &sr->mc_gregs,
+				PT_MSR * sizeof(elf_greg_t)))
+		return -EFAULT;
+	/* copy from orig_r3 (the word after the MSR) up to the end */
+	if (__copy_from_user(&regs->orig_gpr3, &sr->mc_gregs[PT_ORIG_R3],
+				GP_REGS_SIZE - PT_ORIG_R3 * sizeof(elf_greg_t)))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+/*
+ * When we have signals to deliver, we set up on the
+ * user stack, going down from the original stack pointer:
+ *	an ABI gap of 56 words
+ *	an mcontext struct
+ *	a sigcontext struct
+ *	a gap of __SIGNAL_FRAMESIZE bytes
+ *
+ * Each of these things must be a multiple of 16 bytes in size. The following
+ * structure represent all of this except the __SIGNAL_FRAMESIZE gap
+ *
+ */
+struct sigframe {
+	struct sigcontext sctx;		/* the sigcontext */
+	struct mcontext	mctx;		/* all the register values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct sigcontext sctx_transact;
+	struct mcontext	mctx_transact;
+#endif
+	/*
+	 * Programs using the rs6000/xcoff abi can save up to 19 gp
+	 * regs and 18 fp regs below sp before decrementing it.
+	 */
+	int			abigap[56];
+};
+
+/* We use the mc_pad field for the signal return trampoline. */
+#define tramp	mc_pad
+
+/*
+ *  When we have rt signals to deliver, we set up on the
+ *  user stack, going down from the original stack pointer:
+ *	one rt_sigframe struct (siginfo + ucontext + ABI gap)
+ *	a gap of __SIGNAL_FRAMESIZE+16 bytes
+ *  (the +16 is to get the siginfo and ucontext in the same
+ *  positions as in older kernels).
+ *
+ *  Each of these things must be a multiple of 16 bytes in size.
+ *
+ */
+struct rt_sigframe {
+#ifdef CONFIG_PPC64
+	compat_siginfo_t info;
+#else
+	struct siginfo info;
+#endif
+	struct ucontext	uc;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct ucontext	uc_transact;
+#endif
+	/*
+	 * Programs using the rs6000/xcoff abi can save up to 19 gp
+	 * regs and 18 fp regs below sp before decrementing it.
+	 */
+	int			abigap[56];
+};
+
+#ifdef CONFIG_VSX
+unsigned long copy_fpr_to_user(void __user *to,
+			       struct task_struct *task)
+{
+	u64 buf[ELF_NFPREG];
+	int i;
+
+	/* save FPR copy to local buffer then write to the thread_struct */
+	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+		buf[i] = task->thread.TS_FPR(i);
+	buf[i] = task->thread.fp_state.fpscr;
+	return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_fpr_from_user(struct task_struct *task,
+				 void __user *from)
+{
+	u64 buf[ELF_NFPREG];
+	int i;
+
+	if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+		return 1;
+	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+		task->thread.TS_FPR(i) = buf[i];
+	task->thread.fp_state.fpscr = buf[i];
+
+	return 0;
+}
+
+unsigned long copy_vsx_to_user(void __user *to,
+			       struct task_struct *task)
+{
+	u64 buf[ELF_NVSRHALFREG];
+	int i;
+
+	/* save FPR copy to local buffer then write to the thread_struct */
+	for (i = 0; i < ELF_NVSRHALFREG; i++)
+		buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+	return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_vsx_from_user(struct task_struct *task,
+				 void __user *from)
+{
+	u64 buf[ELF_NVSRHALFREG];
+	int i;
+
+	if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+		return 1;
+	for (i = 0; i < ELF_NVSRHALFREG ; i++)
+		task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+unsigned long copy_ckfpr_to_user(void __user *to,
+				  struct task_struct *task)
+{
+	u64 buf[ELF_NFPREG];
+	int i;
+
+	/* save FPR copy to local buffer then write to the thread_struct */
+	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+		buf[i] = task->thread.TS_CKFPR(i);
+	buf[i] = task->thread.ckfp_state.fpscr;
+	return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_ckfpr_from_user(struct task_struct *task,
+					  void __user *from)
+{
+	u64 buf[ELF_NFPREG];
+	int i;
+
+	if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+		return 1;
+	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+		task->thread.TS_CKFPR(i) = buf[i];
+	task->thread.ckfp_state.fpscr = buf[i];
+
+	return 0;
+}
+
+unsigned long copy_ckvsx_to_user(void __user *to,
+				  struct task_struct *task)
+{
+	u64 buf[ELF_NVSRHALFREG];
+	int i;
+
+	/* save FPR copy to local buffer then write to the thread_struct */
+	for (i = 0; i < ELF_NVSRHALFREG; i++)
+		buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
+	return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_ckvsx_from_user(struct task_struct *task,
+					  void __user *from)
+{
+	u64 buf[ELF_NVSRHALFREG];
+	int i;
+
+	if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+		return 1;
+	for (i = 0; i < ELF_NVSRHALFREG ; i++)
+		task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+	return 0;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#else
+inline unsigned long copy_fpr_to_user(void __user *to,
+				      struct task_struct *task)
+{
+	return __copy_to_user(to, task->thread.fp_state.fpr,
+			      ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_fpr_from_user(struct task_struct *task,
+					void __user *from)
+{
+	return __copy_from_user(task->thread.fp_state.fpr, from,
+			      ELF_NFPREG * sizeof(double));
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+inline unsigned long copy_ckfpr_to_user(void __user *to,
+					 struct task_struct *task)
+{
+	return __copy_to_user(to, task->thread.ckfp_state.fpr,
+			      ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
+						 void __user *from)
+{
+	return __copy_from_user(task->thread.ckfp_state.fpr, from,
+				ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#endif
+
+/*
+ * Save the current user registers on the user stack.
+ * We only save the altivec/spe registers if the process has used
+ * altivec/spe instructions at some point.
+ */
+static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
+			  struct mcontext __user *tm_frame, int sigret,
+			  int ctx_has_vsx_region)
+{
+	unsigned long msr = regs->msr;
+
+	/* Make sure floating point registers are stored in regs */
+	flush_fp_to_thread(current);
+
+	/* save general registers */
+	if (save_general_regs(regs, frame))
+		return 1;
+
+#ifdef CONFIG_ALTIVEC
+	/* save altivec registers */
+	if (current->thread.used_vr) {
+		flush_altivec_to_thread(current);
+		if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
+				   ELF_NVRREG * sizeof(vector128)))
+			return 1;
+		/* set MSR_VEC in the saved MSR value to indicate that
+		   frame->mc_vregs contains valid data */
+		msr |= MSR_VEC;
+	}
+	/* else assert((regs->msr & MSR_VEC) == 0) */
+
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec. Since VSCR only contains 32 bits saved in the least
+	 * significant bits of a vector, we "cheat" and stuff VRSAVE in the
+	 * most significant bits of that same vector. --BenH
+	 * Note that the current VRSAVE value is in the SPR at this point.
+	 */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		current->thread.vrsave = mfspr(SPRN_VRSAVE);
+	if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32]))
+		return 1;
+#endif /* CONFIG_ALTIVEC */
+	if (copy_fpr_to_user(&frame->mc_fregs, current))
+		return 1;
+
+	/*
+	 * Clear the MSR VSX bit to indicate there is no valid state attached
+	 * to this context, except in the specific case below where we set it.
+	 */
+	msr &= ~MSR_VSX;
+#ifdef CONFIG_VSX
+	/*
+	 * Copy VSR 0-31 upper half from thread_struct to local
+	 * buffer, then write that to userspace.  Also set MSR_VSX in
+	 * the saved MSR value to indicate that frame->mc_vregs
+	 * contains valid data
+	 */
+	if (current->thread.used_vsr && ctx_has_vsx_region) {
+		flush_vsx_to_thread(current);
+		if (copy_vsx_to_user(&frame->mc_vsregs, current))
+			return 1;
+		msr |= MSR_VSX;
+	}
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_SPE
+	/* save spe registers */
+	if (current->thread.used_spe) {
+		flush_spe_to_thread(current);
+		if (__copy_to_user(&frame->mc_vregs, current->thread.evr,
+				   ELF_NEVRREG * sizeof(u32)))
+			return 1;
+		/* set MSR_SPE in the saved MSR value to indicate that
+		   frame->mc_vregs contains valid data */
+		msr |= MSR_SPE;
+	}
+	/* else assert((regs->msr & MSR_SPE) == 0) */
+
+	/* We always copy to/from spefscr */
+	if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG))
+		return 1;
+#endif /* CONFIG_SPE */
+
+	if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
+		return 1;
+	/* We need to write 0 the MSR top 32 bits in the tm frame so that we
+	 * can check it on the restore to see if TM is active
+	 */
+	if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+		return 1;
+
+	if (sigret) {
+		/* Set up the sigreturn trampoline: li r0,sigret; sc */
+		if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
+		    || __put_user(0x44000002UL, &frame->tramp[1]))
+			return 1;
+		flush_icache_range((unsigned long) &frame->tramp[0],
+				   (unsigned long) &frame->tramp[2]);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save the current user registers on the user stack.
+ * We only save the altivec/spe registers if the process has used
+ * altivec/spe instructions at some point.
+ * We also save the transactional registers to a second ucontext in the
+ * frame.
+ *
+ * See save_user_regs() and signal_64.c:setup_tm_sigcontexts().
+ */
+static int save_tm_user_regs(struct pt_regs *regs,
+			     struct mcontext __user *frame,
+			     struct mcontext __user *tm_frame, int sigret,
+			     unsigned long msr)
+{
+	WARN_ON(tm_suspend_disabled);
+
+	/* Save both sets of general registers */
+	if (save_general_regs(&current->thread.ckpt_regs, frame)
+	    || save_general_regs(regs, tm_frame))
+		return 1;
+
+	/* Stash the top half of the 64bit MSR into the 32bit MSR word
+	 * of the transactional mcontext.  This way we have a backward-compatible
+	 * MSR in the 'normal' (checkpointed) mcontext and additionally one can
+	 * also look at what type of transaction (T or S) was active at the
+	 * time of the signal.
+	 */
+	if (__put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR]))
+		return 1;
+
+#ifdef CONFIG_ALTIVEC
+	/* save altivec registers */
+	if (current->thread.used_vr) {
+		if (__copy_to_user(&frame->mc_vregs, &current->thread.ckvr_state,
+				   ELF_NVRREG * sizeof(vector128)))
+			return 1;
+		if (msr & MSR_VEC) {
+			if (__copy_to_user(&tm_frame->mc_vregs,
+					   &current->thread.vr_state,
+					   ELF_NVRREG * sizeof(vector128)))
+				return 1;
+		} else {
+			if (__copy_to_user(&tm_frame->mc_vregs,
+					   &current->thread.ckvr_state,
+					   ELF_NVRREG * sizeof(vector128)))
+				return 1;
+		}
+
+		/* set MSR_VEC in the saved MSR value to indicate that
+		 * frame->mc_vregs contains valid data
+		 */
+		msr |= MSR_VEC;
+	}
+
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec. Since VSCR only contains 32 bits saved in the least
+	 * significant bits of a vector, we "cheat" and stuff VRSAVE in the
+	 * most significant bits of that same vector. --BenH
+	 */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		current->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+	if (__put_user(current->thread.ckvrsave,
+		       (u32 __user *)&frame->mc_vregs[32]))
+		return 1;
+	if (msr & MSR_VEC) {
+		if (__put_user(current->thread.vrsave,
+			       (u32 __user *)&tm_frame->mc_vregs[32]))
+			return 1;
+	} else {
+		if (__put_user(current->thread.ckvrsave,
+			       (u32 __user *)&tm_frame->mc_vregs[32]))
+			return 1;
+	}
+#endif /* CONFIG_ALTIVEC */
+
+	if (copy_ckfpr_to_user(&frame->mc_fregs, current))
+		return 1;
+	if (msr & MSR_FP) {
+		if (copy_fpr_to_user(&tm_frame->mc_fregs, current))
+			return 1;
+	} else {
+		if (copy_ckfpr_to_user(&tm_frame->mc_fregs, current))
+			return 1;
+	}
+
+#ifdef CONFIG_VSX
+	/*
+	 * Copy VSR 0-31 upper half from thread_struct to local
+	 * buffer, then write that to userspace.  Also set MSR_VSX in
+	 * the saved MSR value to indicate that frame->mc_vregs
+	 * contains valid data
+	 */
+	if (current->thread.used_vsr) {
+		if (copy_ckvsx_to_user(&frame->mc_vsregs, current))
+			return 1;
+		if (msr & MSR_VSX) {
+			if (copy_vsx_to_user(&tm_frame->mc_vsregs,
+						      current))
+				return 1;
+		} else {
+			if (copy_ckvsx_to_user(&tm_frame->mc_vsregs, current))
+				return 1;
+		}
+
+		msr |= MSR_VSX;
+	}
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_SPE
+	/* SPE regs are not checkpointed with TM, so this section is
+	 * simply the same as in save_user_regs().
+	 */
+	if (current->thread.used_spe) {
+		flush_spe_to_thread(current);
+		if (__copy_to_user(&frame->mc_vregs, current->thread.evr,
+				   ELF_NEVRREG * sizeof(u32)))
+			return 1;
+		/* set MSR_SPE in the saved MSR value to indicate that
+		 * frame->mc_vregs contains valid data */
+		msr |= MSR_SPE;
+	}
+
+	/* We always copy to/from spefscr */
+	if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG))
+		return 1;
+#endif /* CONFIG_SPE */
+
+	if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
+		return 1;
+	if (sigret) {
+		/* Set up the sigreturn trampoline: li r0,sigret; sc */
+		if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
+		    || __put_user(0x44000002UL, &frame->tramp[1]))
+			return 1;
+		flush_icache_range((unsigned long) &frame->tramp[0],
+				   (unsigned long) &frame->tramp[2]);
+	}
+
+	return 0;
+}
+#endif
+
+/*
+ * Restore the current user register values from the user stack,
+ * (except for MSR).
+ */
+static long restore_user_regs(struct pt_regs *regs,
+			      struct mcontext __user *sr, int sig)
+{
+	long err;
+	unsigned int save_r2 = 0;
+	unsigned long msr;
+#ifdef CONFIG_VSX
+	int i;
+#endif
+
+	/*
+	 * restore general registers but not including MSR or SOFTE. Also
+	 * take care of keeping r2 (TLS) intact if not a signal
+	 */
+	if (!sig)
+		save_r2 = (unsigned int)regs->gpr[2];
+	err = restore_general_regs(regs, sr);
+	regs->trap = 0;
+	err |= __get_user(msr, &sr->mc_gregs[PT_MSR]);
+	if (!sig)
+		regs->gpr[2] = (unsigned long) save_r2;
+	if (err)
+		return 1;
+
+	/* if doing signal return, restore the previous little-endian mode */
+	if (sig)
+		regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
+
+#ifdef CONFIG_ALTIVEC
+	/*
+	 * Force the process to reload the altivec registers from
+	 * current->thread when it next does altivec instructions
+	 */
+	regs->msr &= ~MSR_VEC;
+	if (msr & MSR_VEC) {
+		/* restore altivec registers from the stack */
+		if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs,
+				     sizeof(sr->mc_vregs)))
+			return 1;
+		current->thread.used_vr = true;
+	} else if (current->thread.used_vr)
+		memset(&current->thread.vr_state, 0,
+		       ELF_NVRREG * sizeof(vector128));
+
+	/* Always get VRSAVE back */
+	if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32]))
+		return 1;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+#endif /* CONFIG_ALTIVEC */
+	if (copy_fpr_from_user(current, &sr->mc_fregs))
+		return 1;
+
+#ifdef CONFIG_VSX
+	/*
+	 * Force the process to reload the VSX registers from
+	 * current->thread when it next does VSX instruction.
+	 */
+	regs->msr &= ~MSR_VSX;
+	if (msr & MSR_VSX) {
+		/*
+		 * Restore altivec registers from the stack to a local
+		 * buffer, then write this out to the thread_struct
+		 */
+		if (copy_vsx_from_user(current, &sr->mc_vsregs))
+			return 1;
+		current->thread.used_vsr = true;
+	} else if (current->thread.used_vsr)
+		for (i = 0; i < 32 ; i++)
+			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+#endif /* CONFIG_VSX */
+	/*
+	 * force the process to reload the FP registers from
+	 * current->thread when it next does FP instructions
+	 */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
+
+#ifdef CONFIG_SPE
+	/* force the process to reload the spe registers from
+	   current->thread when it next does spe instructions */
+	regs->msr &= ~MSR_SPE;
+	if (msr & MSR_SPE) {
+		/* restore spe registers from the stack */
+		if (__copy_from_user(current->thread.evr, &sr->mc_vregs,
+				     ELF_NEVRREG * sizeof(u32)))
+			return 1;
+		current->thread.used_spe = true;
+	} else if (current->thread.used_spe)
+		memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
+
+	/* Always get SPEFSCR back */
+	if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG))
+		return 1;
+#endif /* CONFIG_SPE */
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Restore the current user register values from the user stack, except for
+ * MSR, and recheckpoint the original checkpointed register state for processes
+ * in transactions.
+ */
+static long restore_tm_user_regs(struct pt_regs *regs,
+				 struct mcontext __user *sr,
+				 struct mcontext __user *tm_sr)
+{
+	long err;
+	unsigned long msr, msr_hi;
+#ifdef CONFIG_VSX
+	int i;
+#endif
+
+	if (tm_suspend_disabled)
+		return 1;
+	/*
+	 * restore general registers but not including MSR or SOFTE. Also
+	 * take care of keeping r2 (TLS) intact if not a signal.
+	 * See comment in signal_64.c:restore_tm_sigcontexts();
+	 * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR
+	 * were set by the signal delivery.
+	 */
+	err = restore_general_regs(regs, tm_sr);
+	err |= restore_general_regs(&current->thread.ckpt_regs, sr);
+
+	err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]);
+
+	err |= __get_user(msr, &sr->mc_gregs[PT_MSR]);
+	if (err)
+		return 1;
+
+	/* Restore the previous little-endian mode */
+	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
+
+#ifdef CONFIG_ALTIVEC
+	regs->msr &= ~MSR_VEC;
+	if (msr & MSR_VEC) {
+		/* restore altivec registers from the stack */
+		if (__copy_from_user(&current->thread.ckvr_state, &sr->mc_vregs,
+				     sizeof(sr->mc_vregs)) ||
+		    __copy_from_user(&current->thread.vr_state,
+				     &tm_sr->mc_vregs,
+				     sizeof(sr->mc_vregs)))
+			return 1;
+		current->thread.used_vr = true;
+	} else if (current->thread.used_vr) {
+		memset(&current->thread.vr_state, 0,
+		       ELF_NVRREG * sizeof(vector128));
+		memset(&current->thread.ckvr_state, 0,
+		       ELF_NVRREG * sizeof(vector128));
+	}
+
+	/* Always get VRSAVE back */
+	if (__get_user(current->thread.ckvrsave,
+		       (u32 __user *)&sr->mc_vregs[32]) ||
+	    __get_user(current->thread.vrsave,
+		       (u32 __user *)&tm_sr->mc_vregs[32]))
+		return 1;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		mtspr(SPRN_VRSAVE, current->thread.ckvrsave);
+#endif /* CONFIG_ALTIVEC */
+
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
+
+	if (copy_fpr_from_user(current, &sr->mc_fregs) ||
+	    copy_ckfpr_from_user(current, &tm_sr->mc_fregs))
+		return 1;
+
+#ifdef CONFIG_VSX
+	regs->msr &= ~MSR_VSX;
+	if (msr & MSR_VSX) {
+		/*
+		 * Restore altivec registers from the stack to a local
+		 * buffer, then write this out to the thread_struct
+		 */
+		if (copy_vsx_from_user(current, &tm_sr->mc_vsregs) ||
+		    copy_ckvsx_from_user(current, &sr->mc_vsregs))
+			return 1;
+		current->thread.used_vsr = true;
+	} else if (current->thread.used_vsr)
+		for (i = 0; i < 32 ; i++) {
+			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+			current->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+		}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_SPE
+	/* SPE regs are not checkpointed with TM, so this section is
+	 * simply the same as in restore_user_regs().
+	 */
+	regs->msr &= ~MSR_SPE;
+	if (msr & MSR_SPE) {
+		if (__copy_from_user(current->thread.evr, &sr->mc_vregs,
+				     ELF_NEVRREG * sizeof(u32)))
+			return 1;
+		current->thread.used_spe = true;
+	} else if (current->thread.used_spe)
+		memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
+
+	/* Always get SPEFSCR back */
+	if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs
+		       + ELF_NEVRREG))
+		return 1;
+#endif /* CONFIG_SPE */
+
+	/* Get the top half of the MSR from the user context */
+	if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
+		return 1;
+	msr_hi <<= 32;
+	/* If TM bits are set to the reserved value, it's an invalid context */
+	if (MSR_TM_RESV(msr_hi))
+		return 1;
+
+	/*
+	 * Disabling preemption, since it is unsafe to be preempted
+	 * with MSR[TS] set without recheckpointing.
+	 */
+	preempt_disable();
+
+	/*
+	 * CAUTION:
+	 * After regs->MSR[TS] being updated, make sure that get_user(),
+	 * put_user() or similar functions are *not* called. These
+	 * functions can generate page faults which will cause the process
+	 * to be de-scheduled with MSR[TS] set but without calling
+	 * tm_recheckpoint(). This can cause a bug.
+	 *
+	 * Pull in the MSR TM bits from the user context
+	 */
+	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK);
+	/* Now, recheckpoint.  This loads up all of the checkpointed (older)
+	 * registers, including FP and V[S]Rs.  After recheckpointing, the
+	 * transactional versions should be loaded.
+	 */
+	tm_enable();
+	/* Make sure the transaction is marked as failed */
+	current->thread.tm_texasr |= TEXASR_FS;
+	/* This loads the checkpointed FP/VEC state, if used */
+	tm_recheckpoint(&current->thread);
+
+	/* This loads the speculative FP/VEC state, if used */
+	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
+	if (msr & MSR_FP) {
+		load_fp_state(&current->thread.fp_state);
+		regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+	}
+#ifdef CONFIG_ALTIVEC
+	if (msr & MSR_VEC) {
+		load_vr_state(&current->thread.vr_state);
+		regs->msr |= MSR_VEC;
+	}
+#endif
+
+	preempt_enable();
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC64
+
+#define copy_siginfo_to_user	copy_siginfo_to_user32
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Set up a signal frame for a "real-time" signal handler
+ * (one which gets siginfo).
+ */
+int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
+		       struct task_struct *tsk)
+{
+	struct rt_sigframe __user *rt_sf;
+	struct mcontext __user *frame;
+	struct mcontext __user *tm_frame = NULL;
+	void __user *addr;
+	unsigned long newsp = 0;
+	int sigret;
+	unsigned long tramp;
+	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
+
+	BUG_ON(tsk != current);
+
+	/* Set up Signal Frame */
+	/* Put a Real Time Context onto stack */
+	rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
+	addr = rt_sf;
+	if (unlikely(rt_sf == NULL))
+		goto badframe;
+
+	/* Put the siginfo & fill in most of the ucontext */
+	if (copy_siginfo_to_user(&rt_sf->info, &ksig->info)
+	    || __put_user(0, &rt_sf->uc.uc_flags)
+	    || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1])
+	    || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext),
+		    &rt_sf->uc.uc_regs)
+	    || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
+		goto badframe;
+
+	/* Save user registers on the stack */
+	frame = &rt_sf->uc.uc_mcontext;
+	addr = frame;
+	if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
+		sigret = 0;
+		tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
+	} else {
+		sigret = __NR_rt_sigreturn;
+		tramp = (unsigned long) frame->tramp;
+	}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	tm_frame = &rt_sf->uc_transact.uc_mcontext;
+	if (MSR_TM_ACTIVE(msr)) {
+		if (__put_user((unsigned long)&rt_sf->uc_transact,
+			       &rt_sf->uc.uc_link) ||
+		    __put_user((unsigned long)tm_frame,
+			       &rt_sf->uc_transact.uc_regs))
+			goto badframe;
+		if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
+			goto badframe;
+	}
+	else
+#endif
+	{
+		if (__put_user(0, &rt_sf->uc.uc_link))
+			goto badframe;
+		if (save_user_regs(regs, frame, tm_frame, sigret, 1))
+			goto badframe;
+	}
+	regs->link = tramp;
+
+	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
+
+	/* create a stack frame for the caller of the handler */
+	newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
+	addr = (void __user *)regs->gpr[1];
+	if (put_user(regs->gpr[1], (u32 __user *)newsp))
+		goto badframe;
+
+	/* Fill registers for signal handler */
+	regs->gpr[1] = newsp;
+	regs->gpr[3] = ksig->sig;
+	regs->gpr[4] = (unsigned long) &rt_sf->info;
+	regs->gpr[5] = (unsigned long) &rt_sf->uc;
+	regs->gpr[6] = (unsigned long) rt_sf;
+	regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
+	/* enter the signal handler in native-endian mode */
+	regs->msr &= ~MSR_LE;
+	regs->msr |= (MSR_KERNEL & MSR_LE);
+	return 0;
+
+badframe:
+	if (show_unhandled_signals)
+		printk_ratelimited(KERN_INFO
+				   "%s[%d]: bad frame in handle_rt_signal32: "
+				   "%p nip %08lx lr %08lx\n",
+				   tsk->comm, tsk->pid,
+				   addr, regs->nip, regs->link);
+
+	return 1;
+}
+
+static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int sig)
+{
+	sigset_t set;
+	struct mcontext __user *mcp;
+
+	if (get_sigset_t(&set, &ucp->uc_sigmask))
+		return -EFAULT;
+#ifdef CONFIG_PPC64
+	{
+		u32 cmcp;
+
+		if (__get_user(cmcp, &ucp->uc_regs))
+			return -EFAULT;
+		mcp = (struct mcontext __user *)(u64)cmcp;
+		/* no need to check access_ok(mcp), since mcp < 4GB */
+	}
+#else
+	if (__get_user(mcp, &ucp->uc_regs))
+		return -EFAULT;
+	if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp)))
+		return -EFAULT;
+#endif
+	set_current_blocked(&set);
+	if (restore_user_regs(regs, mcp, sig))
+		return -EFAULT;
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static int do_setcontext_tm(struct ucontext __user *ucp,
+			    struct ucontext __user *tm_ucp,
+			    struct pt_regs *regs)
+{
+	sigset_t set;
+	struct mcontext __user *mcp;
+	struct mcontext __user *tm_mcp;
+	u32 cmcp;
+	u32 tm_cmcp;
+
+	if (get_sigset_t(&set, &ucp->uc_sigmask))
+		return -EFAULT;
+
+	if (__get_user(cmcp, &ucp->uc_regs) ||
+	    __get_user(tm_cmcp, &tm_ucp->uc_regs))
+		return -EFAULT;
+	mcp = (struct mcontext __user *)(u64)cmcp;
+	tm_mcp = (struct mcontext __user *)(u64)tm_cmcp;
+	/* no need to check access_ok(mcp), since mcp < 4GB */
+
+	set_current_blocked(&set);
+	if (restore_tm_user_regs(regs, mcp, tm_mcp))
+		return -EFAULT;
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PPC64
+COMPAT_SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+		       struct ucontext __user *, new_ctx, int, ctx_size)
+#else
+SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+		       struct ucontext __user *, new_ctx, long, ctx_size)
+#endif
+{
+	struct pt_regs *regs = current_pt_regs();
+	int ctx_has_vsx_region = 0;
+
+#ifdef CONFIG_PPC64
+	unsigned long new_msr = 0;
+
+	if (new_ctx) {
+		struct mcontext __user *mcp;
+		u32 cmcp;
+
+		/*
+		 * Get pointer to the real mcontext.  No need for
+		 * access_ok since we are dealing with compat
+		 * pointers.
+		 */
+		if (__get_user(cmcp, &new_ctx->uc_regs))
+			return -EFAULT;
+		mcp = (struct mcontext __user *)(u64)cmcp;
+		if (__get_user(new_msr, &mcp->mc_gregs[PT_MSR]))
+			return -EFAULT;
+	}
+	/*
+	 * Check that the context is not smaller than the original
+	 * size (with VMX but without VSX)
+	 */
+	if (ctx_size < UCONTEXTSIZEWITHOUTVSX)
+		return -EINVAL;
+	/*
+	 * If the new context state sets the MSR VSX bits but
+	 * it doesn't provide VSX state.
+	 */
+	if ((ctx_size < sizeof(struct ucontext)) &&
+	    (new_msr & MSR_VSX))
+		return -EINVAL;
+	/* Does the context have enough room to store VSX data? */
+	if (ctx_size >= sizeof(struct ucontext))
+		ctx_has_vsx_region = 1;
+#else
+	/* Context size is for future use. Right now, we only make sure
+	 * we are passed something we understand
+	 */
+	if (ctx_size < sizeof(struct ucontext))
+		return -EINVAL;
+#endif
+	if (old_ctx != NULL) {
+		struct mcontext __user *mctx;
+
+		/*
+		 * old_ctx might not be 16-byte aligned, in which
+		 * case old_ctx->uc_mcontext won't be either.
+		 * Because we have the old_ctx->uc_pad2 field
+		 * before old_ctx->uc_mcontext, we need to round down
+		 * from &old_ctx->uc_mcontext to a 16-byte boundary.
+		 */
+		mctx = (struct mcontext __user *)
+			((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
+		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
+		    || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
+		    || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
+		    || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
+			return -EFAULT;
+	}
+	if (new_ctx == NULL)
+		return 0;
+	if (!access_ok(VERIFY_READ, new_ctx, ctx_size) ||
+	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
+		return -EFAULT;
+
+	/*
+	 * If we get a fault copying the context into the kernel's
+	 * image of the user's registers, we can't just return -EFAULT
+	 * because the user's registers will be corrupted.  For instance
+	 * the NIP value may have been updated but not some of the
+	 * other registers.  Given that we have done the access_ok
+	 * and successfully read the first and last bytes of the region
+	 * above, this should only happen in an out-of-memory situation
+	 * or if another thread unmaps the region containing the context.
+	 * We kill the task with a SIGSEGV in this situation.
+	 */
+	if (do_setcontext(new_ctx, regs, 0))
+		do_exit(SIGSEGV);
+
+	set_thread_flag(TIF_RESTOREALL);
+	return 0;
+}
+
+#ifdef CONFIG_PPC64
+COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
+#else
+SYSCALL_DEFINE0(rt_sigreturn)
+#endif
+{
+	struct rt_sigframe __user *rt_sf;
+	struct pt_regs *regs = current_pt_regs();
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct ucontext __user *uc_transact;
+	unsigned long msr_hi;
+	unsigned long tmp;
+	int tm_restore = 0;
+#endif
+	/* Always make any pending restarted system calls return -EINTR */
+	current->restart_block.fn = do_no_restart_syscall;
+
+	rt_sf = (struct rt_sigframe __user *)
+		(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
+	if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf)))
+		goto bad;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * If there is a transactional state then throw it away.
+	 * The purpose of a sigreturn is to destroy all traces of the
+	 * signal frame, this includes any transactional state created
+	 * within in. We only check for suspended as we can never be
+	 * active in the kernel, we are active, there is nothing better to
+	 * do than go ahead and Bad Thing later.
+	 * The cause is not important as there will never be a
+	 * recheckpoint so it's not user visible.
+	 */
+	if (MSR_TM_SUSPENDED(mfmsr()))
+		tm_reclaim_current(0);
+
+	if (__get_user(tmp, &rt_sf->uc.uc_link))
+		goto bad;
+	uc_transact = (struct ucontext __user *)(uintptr_t)tmp;
+	if (uc_transact) {
+		u32 cmcp;
+		struct mcontext __user *mcp;
+
+		if (__get_user(cmcp, &uc_transact->uc_regs))
+			return -EFAULT;
+		mcp = (struct mcontext __user *)(u64)cmcp;
+		/* The top 32 bits of the MSR are stashed in the transactional
+		 * ucontext. */
+		if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
+			goto bad;
+
+		if (MSR_TM_ACTIVE(msr_hi<<32)) {
+			/* Trying to start TM on non TM system */
+			if (!cpu_has_feature(CPU_FTR_TM))
+				goto bad;
+			/* We only recheckpoint on return if we're
+			 * transaction.
+			 */
+			tm_restore = 1;
+			if (do_setcontext_tm(&rt_sf->uc, uc_transact, regs))
+				goto bad;
+		}
+	}
+	if (!tm_restore)
+		/* Fall through, for non-TM restore */
+#endif
+	if (do_setcontext(&rt_sf->uc, regs, 1))
+		goto bad;
+
+	/*
+	 * It's not clear whether or why it is desirable to save the
+	 * sigaltstack setting on signal delivery and restore it on
+	 * signal return.  But other architectures do this and we have
+	 * always done it up until now so it is probably better not to
+	 * change it.  -- paulus
+	 */
+#ifdef CONFIG_PPC64
+	if (compat_restore_altstack(&rt_sf->uc.uc_stack))
+		goto bad;
+#else
+	if (restore_altstack(&rt_sf->uc.uc_stack))
+		goto bad;
+#endif
+	set_thread_flag(TIF_RESTOREALL);
+	return 0;
+
+ bad:
+	if (show_unhandled_signals)
+		printk_ratelimited(KERN_INFO
+				   "%s[%d]: bad frame in sys_rt_sigreturn: "
+				   "%p nip %08lx lr %08lx\n",
+				   current->comm, current->pid,
+				   rt_sf, regs->nip, regs->link);
+
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+#ifdef CONFIG_PPC32
+SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
+			 int, ndbg, struct sig_dbg_op __user *, dbg)
+{
+	struct pt_regs *regs = current_pt_regs();
+	struct sig_dbg_op op;
+	int i;
+	unsigned long new_msr = regs->msr;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	unsigned long new_dbcr0 = current->thread.debug.dbcr0;
+#endif
+
+	for (i=0; i<ndbg; i++) {
+		if (copy_from_user(&op, dbg + i, sizeof(op)))
+			return -EFAULT;
+		switch (op.dbg_type) {
+		case SIG_DBG_SINGLE_STEPPING:
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+			if (op.dbg_value) {
+				new_msr |= MSR_DE;
+				new_dbcr0 |= (DBCR0_IDM | DBCR0_IC);
+			} else {
+				new_dbcr0 &= ~DBCR0_IC;
+				if (!DBCR_ACTIVE_EVENTS(new_dbcr0,
+						current->thread.debug.dbcr1)) {
+					new_msr &= ~MSR_DE;
+					new_dbcr0 &= ~DBCR0_IDM;
+				}
+			}
+#else
+			if (op.dbg_value)
+				new_msr |= MSR_SE;
+			else
+				new_msr &= ~MSR_SE;
+#endif
+			break;
+		case SIG_DBG_BRANCH_TRACING:
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+			return -EINVAL;
+#else
+			if (op.dbg_value)
+				new_msr |= MSR_BE;
+			else
+				new_msr &= ~MSR_BE;
+#endif
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* We wait until here to actually install the values in the
+	   registers so if we fail in the above loop, it will not
+	   affect the contents of these registers.  After this point,
+	   failure is a problem, anyway, and it's very unlikely unless
+	   the user is really doing something wrong. */
+	regs->msr = new_msr;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	current->thread.debug.dbcr0 = new_dbcr0;
+#endif
+
+	if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) ||
+	    fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
+		return -EFAULT;
+
+	/*
+	 * If we get a fault copying the context into the kernel's
+	 * image of the user's registers, we can't just return -EFAULT
+	 * because the user's registers will be corrupted.  For instance
+	 * the NIP value may have been updated but not some of the
+	 * other registers.  Given that we have done the access_ok
+	 * and successfully read the first and last bytes of the region
+	 * above, this should only happen in an out-of-memory situation
+	 * or if another thread unmaps the region containing the context.
+	 * We kill the task with a SIGSEGV in this situation.
+	 */
+	if (do_setcontext(ctx, regs, 1)) {
+		if (show_unhandled_signals)
+			printk_ratelimited(KERN_INFO "%s[%d]: bad frame in "
+					   "sys_debug_setcontext: %p nip %08lx "
+					   "lr %08lx\n",
+					   current->comm, current->pid,
+					   ctx, regs->nip, regs->link);
+
+		force_sig(SIGSEGV, current);
+		goto out;
+	}
+
+	/*
+	 * It's not clear whether or why it is desirable to save the
+	 * sigaltstack setting on signal delivery and restore it on
+	 * signal return.  But other architectures do this and we have
+	 * always done it up until now so it is probably better not to
+	 * change it.  -- paulus
+	 */
+	restore_altstack(&ctx->uc_stack);
+
+	set_thread_flag(TIF_RESTOREALL);
+ out:
+	return 0;
+}
+#endif
+
+/*
+ * OK, we're invoking a handler
+ */
+int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
+		struct task_struct *tsk)
+{
+	struct sigcontext __user *sc;
+	struct sigframe __user *frame;
+	struct mcontext __user *tm_mctx = NULL;
+	unsigned long newsp = 0;
+	int sigret;
+	unsigned long tramp;
+	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
+
+	BUG_ON(tsk != current);
+
+	/* Set up Signal Frame */
+	frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
+	if (unlikely(frame == NULL))
+		goto badframe;
+	sc = (struct sigcontext __user *) &frame->sctx;
+
+#if _NSIG != 64
+#error "Please adjust handle_signal()"
+#endif
+	if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler)
+	    || __put_user(oldset->sig[0], &sc->oldmask)
+#ifdef CONFIG_PPC64
+	    || __put_user((oldset->sig[0] >> 32), &sc->_unused[3])
+#else
+	    || __put_user(oldset->sig[1], &sc->_unused[3])
+#endif
+	    || __put_user(to_user_ptr(&frame->mctx), &sc->regs)
+	    || __put_user(ksig->sig, &sc->signal))
+		goto badframe;
+
+	if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
+		sigret = 0;
+		tramp = tsk->mm->context.vdso_base + vdso32_sigtramp;
+	} else {
+		sigret = __NR_sigreturn;
+		tramp = (unsigned long) frame->mctx.tramp;
+	}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	tm_mctx = &frame->mctx_transact;
+	if (MSR_TM_ACTIVE(msr)) {
+		if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
+				      sigret, msr))
+			goto badframe;
+	}
+	else
+#endif
+	{
+		if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
+			goto badframe;
+	}
+
+	regs->link = tramp;
+
+	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
+
+	/* create a stack frame for the caller of the handler */
+	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
+	if (put_user(regs->gpr[1], (u32 __user *)newsp))
+		goto badframe;
+
+	regs->gpr[1] = newsp;
+	regs->gpr[3] = ksig->sig;
+	regs->gpr[4] = (unsigned long) sc;
+	regs->nip = (unsigned long) (unsigned long)ksig->ka.sa.sa_handler;
+	/* enter the signal handler in big-endian mode */
+	regs->msr &= ~MSR_LE;
+	return 0;
+
+badframe:
+	if (show_unhandled_signals)
+		printk_ratelimited(KERN_INFO
+				   "%s[%d]: bad frame in handle_signal32: "
+				   "%p nip %08lx lr %08lx\n",
+				   tsk->comm, tsk->pid,
+				   frame, regs->nip, regs->link);
+
+	return 1;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#ifdef CONFIG_PPC64
+COMPAT_SYSCALL_DEFINE0(sigreturn)
+#else
+SYSCALL_DEFINE0(sigreturn)
+#endif
+{
+	struct pt_regs *regs = current_pt_regs();
+	struct sigframe __user *sf;
+	struct sigcontext __user *sc;
+	struct sigcontext sigctx;
+	struct mcontext __user *sr;
+	void __user *addr;
+	sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct mcontext __user *mcp, *tm_mcp;
+	unsigned long msr_hi;
+#endif
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current->restart_block.fn = do_no_restart_syscall;
+
+	sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+	sc = &sf->sctx;
+	addr = sc;
+	if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
+		goto badframe;
+
+#ifdef CONFIG_PPC64
+	/*
+	 * Note that PPC32 puts the upper 32 bits of the sigmask in the
+	 * unused part of the signal stackframe
+	 */
+	set.sig[0] = sigctx.oldmask + ((long)(sigctx._unused[3]) << 32);
+#else
+	set.sig[0] = sigctx.oldmask;
+	set.sig[1] = sigctx._unused[3];
+#endif
+	set_current_blocked(&set);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	mcp = (struct mcontext __user *)&sf->mctx;
+	tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
+	if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
+		goto badframe;
+	if (MSR_TM_ACTIVE(msr_hi<<32)) {
+		if (!cpu_has_feature(CPU_FTR_TM))
+			goto badframe;
+		if (restore_tm_user_regs(regs, mcp, tm_mcp))
+			goto badframe;
+	} else
+#endif
+	{
+		sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
+		addr = sr;
+		if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+		    || restore_user_regs(regs, sr, 1))
+			goto badframe;
+	}
+
+	set_thread_flag(TIF_RESTOREALL);
+	return 0;
+
+badframe:
+	if (show_unhandled_signals)
+		printk_ratelimited(KERN_INFO
+				   "%s[%d]: bad frame in sys_sigreturn: "
+				   "%p nip %08lx lr %08lx\n",
+				   current->comm, current->pid,
+				   addr, regs->nip, regs->link);
+
+	force_sig(SIGSEGV, current);
+	return 0;
+}
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
new file mode 100644
index 000000000..7aba592bb
--- /dev/null
+++ b/arch/powerpc/kernel/signal_64.c
@@ -0,0 +1,903 @@
+/*
+ *  PowerPC version 
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/i386/kernel/signal.c"
+ *    Copyright (C) 1991, 1992 Linus Torvalds
+ *    1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/elf.h>
+#include <linux/ptrace.h>
+#include <linux/ratelimit.h>
+#include <linux/syscalls.h>
+
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/cacheflush.h>
+#include <asm/syscalls.h>
+#include <asm/vdso.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/asm-prototypes.h>
+
+#include "signal.h"
+
+
+#define GP_REGS_SIZE	min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
+#define FP_REGS_SIZE	sizeof(elf_fpregset_t)
+
+#define TRAMP_TRACEBACK	3
+#define TRAMP_SIZE	6
+
+/*
+ * When we have signals to deliver, we set up on the user stack,
+ * going down from the original stack pointer:
+ *	1) a rt_sigframe struct which contains the ucontext	
+ *	2) a gap of __SIGNAL_FRAMESIZE bytes which acts as a dummy caller
+ *	   frame for the signal handler.
+ */
+
+struct rt_sigframe {
+	/* sys_rt_sigreturn requires the ucontext be the first field */
+	struct ucontext uc;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct ucontext uc_transact;
+#endif
+	unsigned long _unused[2];
+	unsigned int tramp[TRAMP_SIZE];
+	struct siginfo __user *pinfo;
+	void __user *puc;
+	struct siginfo info;
+	/* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
+	char abigap[USER_REDZONE_SIZE];
+} __attribute__ ((aligned (16)));
+
+static const char fmt32[] = KERN_INFO \
+	"%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n";
+static const char fmt64[] = KERN_INFO \
+	"%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n";
+
+/*
+ * This computes a quad word aligned pointer inside the vmx_reserve array
+ * element. For historical reasons sigcontext might not be quad word aligned,
+ * but the location we write the VMX regs to must be. See the comment in
+ * sigcontext for more detail.
+ */
+#ifdef CONFIG_ALTIVEC
+static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc)
+{
+	return (elf_vrreg_t __user *) (((unsigned long)sc->vmx_reserve + 15) & ~0xful);
+}
+#endif
+
+/*
+ * Set up the sigcontext for the signal frame.
+ */
+
+static long setup_sigcontext(struct sigcontext __user *sc,
+		struct task_struct *tsk, int signr, sigset_t *set,
+		unsigned long handler, int ctx_has_vsx_region)
+{
+	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
+	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
+	 * the context). This is very important because we must ensure we
+	 * don't lose the VRSAVE content that may have been set prior to
+	 * the process doing its first vector operation
+	 * Userland shall check AT_HWCAP to know whether it can rely on the
+	 * v_regs pointer or not
+	 */
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
+	unsigned long vrsave;
+#endif
+	struct pt_regs *regs = tsk->thread.regs;
+	unsigned long msr = regs->msr;
+	long err = 0;
+	/* Force usr to alway see softe as 1 (interrupts enabled) */
+	unsigned long softe = 0x1;
+
+	BUG_ON(tsk != current);
+
+#ifdef CONFIG_ALTIVEC
+	err |= __put_user(v_regs, &sc->v_regs);
+
+	/* save altivec registers */
+	if (tsk->thread.used_vr) {
+		flush_altivec_to_thread(tsk);
+		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
+		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
+				      33 * sizeof(vector128));
+		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
+		 * contains valid data.
+		 */
+		msr |= MSR_VEC;
+	}
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec.
+	 */
+	vrsave = 0;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
+		vrsave = mfspr(SPRN_VRSAVE);
+		tsk->thread.vrsave = vrsave;
+	}
+
+	err |= __put_user(vrsave, (u32 __user *)&v_regs[33]);
+#else /* CONFIG_ALTIVEC */
+	err |= __put_user(0, &sc->v_regs);
+#endif /* CONFIG_ALTIVEC */
+	flush_fp_to_thread(tsk);
+	/* copy fpr regs and fpscr */
+	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
+
+	/*
+	 * Clear the MSR VSX bit to indicate there is no valid state attached
+	 * to this context, except in the specific case below where we set it.
+	 */
+	msr &= ~MSR_VSX;
+#ifdef CONFIG_VSX
+	/*
+	 * Copy VSX low doubleword to local buffer for formatting,
+	 * then out to userspace.  Update v_regs to point after the
+	 * VMX data.
+	 */
+	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
+		flush_vsx_to_thread(tsk);
+		v_regs += ELF_NVRREG;
+		err |= copy_vsx_to_user(v_regs, tsk);
+		/* set MSR_VSX in the MSR value in the frame to
+		 * indicate that sc->vs_reg) contains valid data.
+		 */
+		msr |= MSR_VSX;
+	}
+#endif /* CONFIG_VSX */
+	err |= __put_user(&sc->gp_regs, &sc->regs);
+	WARN_ON(!FULL_REGS(regs));
+	err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
+	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
+	err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]);
+	err |= __put_user(signr, &sc->signal);
+	err |= __put_user(handler, &sc->handler);
+	if (set != NULL)
+		err |=  __put_user(set->sig[0], &sc->oldmask);
+
+	return err;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * As above, but Transactional Memory is in use, so deliver sigcontexts
+ * containing checkpointed and transactional register states.
+ *
+ * To do this, we treclaim (done before entering here) to gather both sets of
+ * registers and set up the 'normal' sigcontext registers with rolled-back
+ * register values such that a simple signal handler sees a correct
+ * checkpointed register state.  If interested, a TM-aware sighandler can
+ * examine the transactional registers in the 2nd sigcontext to determine the
+ * real origin of the signal.
+ */
+static long setup_tm_sigcontexts(struct sigcontext __user *sc,
+				 struct sigcontext __user *tm_sc,
+				 struct task_struct *tsk,
+				 int signr, sigset_t *set, unsigned long handler,
+				 unsigned long msr)
+{
+	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
+	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
+	 * the context). This is very important because we must ensure we
+	 * don't lose the VRSAVE content that may have been set prior to
+	 * the process doing its first vector operation
+	 * Userland shall check AT_HWCAP to know wether it can rely on the
+	 * v_regs pointer or not.
+	 */
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
+	elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
+#endif
+	struct pt_regs *regs = tsk->thread.regs;
+	long err = 0;
+
+	BUG_ON(tsk != current);
+
+	BUG_ON(!MSR_TM_ACTIVE(msr));
+
+	WARN_ON(tm_suspend_disabled);
+
+	/* Restore checkpointed FP, VEC, and VSX bits from ckpt_regs as
+	 * it contains the correct FP, VEC, VSX state after we treclaimed
+	 * the transaction and giveup_all() was called on reclaiming.
+	 */
+	msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);
+
+#ifdef CONFIG_ALTIVEC
+	err |= __put_user(v_regs, &sc->v_regs);
+	err |= __put_user(tm_v_regs, &tm_sc->v_regs);
+
+	/* save altivec registers */
+	if (tsk->thread.used_vr) {
+		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
+		err |= __copy_to_user(v_regs, &tsk->thread.ckvr_state,
+				      33 * sizeof(vector128));
+		/* If VEC was enabled there are transactional VRs valid too,
+		 * else they're a copy of the checkpointed VRs.
+		 */
+		if (msr & MSR_VEC)
+			err |= __copy_to_user(tm_v_regs,
+					      &tsk->thread.vr_state,
+					      33 * sizeof(vector128));
+		else
+			err |= __copy_to_user(tm_v_regs,
+					      &tsk->thread.ckvr_state,
+					      33 * sizeof(vector128));
+
+		/* set MSR_VEC in the MSR value in the frame to indicate
+		 * that sc->v_reg contains valid data.
+		 */
+		msr |= MSR_VEC;
+	}
+	/* We always copy to/from vrsave, it's 0 if we don't have or don't
+	 * use altivec.
+	 */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		tsk->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+	err |= __put_user(tsk->thread.ckvrsave, (u32 __user *)&v_regs[33]);
+	if (msr & MSR_VEC)
+		err |= __put_user(tsk->thread.vrsave,
+				  (u32 __user *)&tm_v_regs[33]);
+	else
+		err |= __put_user(tsk->thread.ckvrsave,
+				  (u32 __user *)&tm_v_regs[33]);
+
+#else /* CONFIG_ALTIVEC */
+	err |= __put_user(0, &sc->v_regs);
+	err |= __put_user(0, &tm_sc->v_regs);
+#endif /* CONFIG_ALTIVEC */
+
+	/* copy fpr regs and fpscr */
+	err |= copy_ckfpr_to_user(&sc->fp_regs, tsk);
+	if (msr & MSR_FP)
+		err |= copy_fpr_to_user(&tm_sc->fp_regs, tsk);
+	else
+		err |= copy_ckfpr_to_user(&tm_sc->fp_regs, tsk);
+
+#ifdef CONFIG_VSX
+	/*
+	 * Copy VSX low doubleword to local buffer for formatting,
+	 * then out to userspace.  Update v_regs to point after the
+	 * VMX data.
+	 */
+	if (tsk->thread.used_vsr) {
+		v_regs += ELF_NVRREG;
+		tm_v_regs += ELF_NVRREG;
+
+		err |= copy_ckvsx_to_user(v_regs, tsk);
+
+		if (msr & MSR_VSX)
+			err |= copy_vsx_to_user(tm_v_regs, tsk);
+		else
+			err |= copy_ckvsx_to_user(tm_v_regs, tsk);
+
+		/* set MSR_VSX in the MSR value in the frame to
+		 * indicate that sc->vs_reg) contains valid data.
+		 */
+		msr |= MSR_VSX;
+	}
+#endif /* CONFIG_VSX */
+
+	err |= __put_user(&sc->gp_regs, &sc->regs);
+	err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs);
+	WARN_ON(!FULL_REGS(regs));
+	err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE);
+	err |= __copy_to_user(&sc->gp_regs,
+			      &tsk->thread.ckpt_regs, GP_REGS_SIZE);
+	err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]);
+	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
+	err |= __put_user(signr, &sc->signal);
+	err |= __put_user(handler, &sc->handler);
+	if (set != NULL)
+		err |=  __put_user(set->sig[0], &sc->oldmask);
+
+	return err;
+}
+#endif
+
+/*
+ * Restore the sigcontext from the signal frame.
+ */
+
+static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
+			      struct sigcontext __user *sc)
+{
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs;
+#endif
+	unsigned long err = 0;
+	unsigned long save_r13 = 0;
+	unsigned long msr;
+	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_VSX
+	int i;
+#endif
+
+	BUG_ON(tsk != current);
+
+	/* If this is not a signal return, we preserve the TLS in r13 */
+	if (!sig)
+		save_r13 = regs->gpr[13];
+
+	/* copy the GPRs */
+	err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr));
+	err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]);
+	/* get MSR separately, transfer the LE bit if doing signal return */
+	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+	if (sig)
+		regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
+	err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]);
+	err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]);
+	err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]);
+	err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]);
+	err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]);
+	/* skip SOFTE */
+	regs->trap = 0;
+	err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
+	err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
+	err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
+
+	if (!sig)
+		regs->gpr[13] = save_r13;
+	if (set != NULL)
+		err |=  __get_user(set->sig[0], &sc->oldmask);
+
+	/*
+	 * Force reload of FP/VEC.
+	 * This has to be done before copying stuff into tsk->thread.fpr/vr
+	 * for the reasons explained in the previous comment.
+	 */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
+
+#ifdef CONFIG_ALTIVEC
+	err |= __get_user(v_regs, &sc->v_regs);
+	if (err)
+		return err;
+	if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
+		return -EFAULT;
+	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
+	if (v_regs != NULL && (msr & MSR_VEC) != 0) {
+		err |= __copy_from_user(&tsk->thread.vr_state, v_regs,
+					33 * sizeof(vector128));
+		tsk->thread.used_vr = true;
+	} else if (tsk->thread.used_vr) {
+		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
+	}
+	/* Always get VRSAVE back */
+	if (v_regs != NULL)
+		err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
+	else
+		tsk->thread.vrsave = 0;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
+#endif /* CONFIG_ALTIVEC */
+	/* restore floating point */
+	err |= copy_fpr_from_user(tsk, &sc->fp_regs);
+#ifdef CONFIG_VSX
+	/*
+	 * Get additional VSX data. Update v_regs to point after the
+	 * VMX data.  Copy VSX low doubleword from userspace to local
+	 * buffer for formatting, then into the taskstruct.
+	 */
+	v_regs += ELF_NVRREG;
+	if ((msr & MSR_VSX) != 0) {
+		err |= copy_vsx_from_user(tsk, v_regs);
+		tsk->thread.used_vsr = true;
+	} else {
+		for (i = 0; i < 32 ; i++)
+			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+	}
+#endif
+	return err;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Restore the two sigcontexts from the frame of a transactional processes.
+ */
+
+static long restore_tm_sigcontexts(struct task_struct *tsk,
+				   struct sigcontext __user *sc,
+				   struct sigcontext __user *tm_sc)
+{
+#ifdef CONFIG_ALTIVEC
+	elf_vrreg_t __user *v_regs, *tm_v_regs;
+#endif
+	unsigned long err = 0;
+	unsigned long msr;
+	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_VSX
+	int i;
+#endif
+
+	BUG_ON(tsk != current);
+
+	if (tm_suspend_disabled)
+		return -EINVAL;
+
+	/* copy the GPRs */
+	err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
+	err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs,
+				sizeof(regs->gpr));
+
+	/*
+	 * TFHAR is restored from the checkpointed 'wound-back' ucontext's NIP.
+	 * TEXASR was set by the signal delivery reclaim, as was TFIAR.
+	 * Users doing anything abhorrent like thread-switching w/ signals for
+	 * TM-Suspended code will have to back TEXASR/TFIAR up themselves.
+	 * For the case of getting a signal and simply returning from it,
+	 * we don't need to re-copy them here.
+	 */
+	err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]);
+	err |= __get_user(tsk->thread.tm_tfhar, &sc->gp_regs[PT_NIP]);
+
+	/* get MSR separately, transfer the LE bit if doing signal return */
+	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+	/* Don't allow reserved mode. */
+	if (MSR_TM_RESV(msr))
+		return -EINVAL;
+
+	/* pull in MSR LE from user context */
+	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
+
+	/* The following non-GPR non-FPR non-VR state is also checkpointed: */
+	err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]);
+	err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]);
+	err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]);
+	err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]);
+	err |= __get_user(tsk->thread.ckpt_regs.ctr,
+			  &sc->gp_regs[PT_CTR]);
+	err |= __get_user(tsk->thread.ckpt_regs.link,
+			  &sc->gp_regs[PT_LNK]);
+	err |= __get_user(tsk->thread.ckpt_regs.xer,
+			  &sc->gp_regs[PT_XER]);
+	err |= __get_user(tsk->thread.ckpt_regs.ccr,
+			  &sc->gp_regs[PT_CCR]);
+
+	/* Don't allow userspace to set the trap value */
+	regs->trap = 0;
+
+	/* These regs are not checkpointed; they can go in 'regs'. */
+	err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
+	err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
+	err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
+
+	/*
+	 * Force reload of FP/VEC.
+	 * This has to be done before copying stuff into tsk->thread.fpr/vr
+	 * for the reasons explained in the previous comment.
+	 */
+	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
+
+#ifdef CONFIG_ALTIVEC
+	err |= __get_user(v_regs, &sc->v_regs);
+	err |= __get_user(tm_v_regs, &tm_sc->v_regs);
+	if (err)
+		return err;
+	if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
+		return -EFAULT;
+	if (tm_v_regs && !access_ok(VERIFY_READ,
+				    tm_v_regs, 34 * sizeof(vector128)))
+		return -EFAULT;
+	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
+	if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) {
+		err |= __copy_from_user(&tsk->thread.ckvr_state, v_regs,
+					33 * sizeof(vector128));
+		err |= __copy_from_user(&tsk->thread.vr_state, tm_v_regs,
+					33 * sizeof(vector128));
+		current->thread.used_vr = true;
+	}
+	else if (tsk->thread.used_vr) {
+		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
+		memset(&tsk->thread.ckvr_state, 0, 33 * sizeof(vector128));
+	}
+	/* Always get VRSAVE back */
+	if (v_regs != NULL && tm_v_regs != NULL) {
+		err |= __get_user(tsk->thread.ckvrsave,
+				  (u32 __user *)&v_regs[33]);
+		err |= __get_user(tsk->thread.vrsave,
+				  (u32 __user *)&tm_v_regs[33]);
+	}
+	else {
+		tsk->thread.vrsave = 0;
+		tsk->thread.ckvrsave = 0;
+	}
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
+#endif /* CONFIG_ALTIVEC */
+	/* restore floating point */
+	err |= copy_fpr_from_user(tsk, &tm_sc->fp_regs);
+	err |= copy_ckfpr_from_user(tsk, &sc->fp_regs);
+#ifdef CONFIG_VSX
+	/*
+	 * Get additional VSX data. Update v_regs to point after the
+	 * VMX data.  Copy VSX low doubleword from userspace to local
+	 * buffer for formatting, then into the taskstruct.
+	 */
+	if (v_regs && ((msr & MSR_VSX) != 0)) {
+		v_regs += ELF_NVRREG;
+		tm_v_regs += ELF_NVRREG;
+		err |= copy_vsx_from_user(tsk, tm_v_regs);
+		err |= copy_ckvsx_from_user(tsk, v_regs);
+		tsk->thread.used_vsr = true;
+	} else {
+		for (i = 0; i < 32 ; i++) {
+			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+			tsk->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+		}
+	}
+#endif
+	tm_enable();
+	/* Make sure the transaction is marked as failed */
+	tsk->thread.tm_texasr |= TEXASR_FS;
+
+	/*
+	 * Disabling preemption, since it is unsafe to be preempted
+	 * with MSR[TS] set without recheckpointing.
+	 */
+	preempt_disable();
+
+	/* pull in MSR TS bits from user context */
+	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+	/*
+	 * Ensure that TM is enabled in regs->msr before we leave the signal
+	 * handler. It could be the case that (a) user disabled the TM bit
+	 * through the manipulation of the MSR bits in uc_mcontext or (b) the
+	 * TM bit was disabled because a sufficient number of context switches
+	 * happened whilst in the signal handler and load_tm overflowed,
+	 * disabling the TM bit. In either case we can end up with an illegal
+	 * TM state leading to a TM Bad Thing when we return to userspace.
+	 *
+	 * CAUTION:
+	 * After regs->MSR[TS] being updated, make sure that get_user(),
+	 * put_user() or similar functions are *not* called. These
+	 * functions can generate page faults which will cause the process
+	 * to be de-scheduled with MSR[TS] set but without calling
+	 * tm_recheckpoint(). This can cause a bug.
+	 */
+	regs->msr |= MSR_TM;
+
+	/* This loads the checkpointed FP/VEC state, if used */
+	tm_recheckpoint(&tsk->thread);
+
+	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
+	if (msr & MSR_FP) {
+		load_fp_state(&tsk->thread.fp_state);
+		regs->msr |= (MSR_FP | tsk->thread.fpexc_mode);
+	}
+	if (msr & MSR_VEC) {
+		load_vr_state(&tsk->thread.vr_state);
+		regs->msr |= MSR_VEC;
+	}
+
+	preempt_enable();
+
+	return err;
+}
+#endif
+
+/*
+ * Setup the trampoline code on the stack
+ */
+static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
+{
+	int i;
+	long err = 0;
+
+	/* addi r1, r1, __SIGNAL_FRAMESIZE  # Pop the dummy stackframe */
+	err |= __put_user(0x38210000UL | (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]);
+	/* li r0, __NR_[rt_]sigreturn| */
+	err |= __put_user(0x38000000UL | (syscall & 0xffff), &tramp[1]);
+	/* sc */
+	err |= __put_user(0x44000002UL, &tramp[2]);
+
+	/* Minimal traceback info */
+	for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
+		err |= __put_user(0, &tramp[i]);
+
+	if (!err)
+		flush_icache_range((unsigned long) &tramp[0],
+			   (unsigned long) &tramp[TRAMP_SIZE]);
+
+	return err;
+}
+
+/*
+ * Userspace code may pass a ucontext which doesn't include VSX added
+ * at the end.  We need to check for this case.
+ */
+#define UCONTEXTSIZEWITHOUTVSX \
+		(sizeof(struct ucontext) - 32*sizeof(long))
+
+/*
+ * Handle {get,set,swap}_context operations
+ */
+SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+		struct ucontext __user *, new_ctx, long, ctx_size)
+{
+	unsigned char tmp;
+	sigset_t set;
+	unsigned long new_msr = 0;
+	int ctx_has_vsx_region = 0;
+
+	if (new_ctx &&
+	    get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR]))
+		return -EFAULT;
+	/*
+	 * Check that the context is not smaller than the original
+	 * size (with VMX but without VSX)
+	 */
+	if (ctx_size < UCONTEXTSIZEWITHOUTVSX)
+		return -EINVAL;
+	/*
+	 * If the new context state sets the MSR VSX bits but
+	 * it doesn't provide VSX state.
+	 */
+	if ((ctx_size < sizeof(struct ucontext)) &&
+	    (new_msr & MSR_VSX))
+		return -EINVAL;
+	/* Does the context have enough room to store VSX data? */
+	if (ctx_size >= sizeof(struct ucontext))
+		ctx_has_vsx_region = 1;
+
+	if (old_ctx != NULL) {
+		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
+		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
+					ctx_has_vsx_region)
+		    || __copy_to_user(&old_ctx->uc_sigmask,
+				      &current->blocked, sizeof(sigset_t)))
+			return -EFAULT;
+	}
+	if (new_ctx == NULL)
+		return 0;
+	if (!access_ok(VERIFY_READ, new_ctx, ctx_size)
+	    || __get_user(tmp, (u8 __user *) new_ctx)
+	    || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1))
+		return -EFAULT;
+
+	/*
+	 * If we get a fault copying the context into the kernel's
+	 * image of the user's registers, we can't just return -EFAULT
+	 * because the user's registers will be corrupted.  For instance
+	 * the NIP value may have been updated but not some of the
+	 * other registers.  Given that we have done the access_ok
+	 * and successfully read the first and last bytes of the region
+	 * above, this should only happen in an out-of-memory situation
+	 * or if another thread unmaps the region containing the context.
+	 * We kill the task with a SIGSEGV in this situation.
+	 */
+
+	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
+		do_exit(SIGSEGV);
+	set_current_blocked(&set);
+	if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext))
+		do_exit(SIGSEGV);
+
+	/* This returns like rt_sigreturn */
+	set_thread_flag(TIF_RESTOREALL);
+	return 0;
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+	struct pt_regs *regs = current_pt_regs();
+	struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1];
+	sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	unsigned long msr;
+#endif
+
+	/* Always make any pending restarted system calls return -EINTR */
+	current->restart_block.fn = do_no_restart_syscall;
+
+	if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
+		goto badframe;
+
+	if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
+		goto badframe;
+	set_current_blocked(&set);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * If there is a transactional state then throw it away.
+	 * The purpose of a sigreturn is to destroy all traces of the
+	 * signal frame, this includes any transactional state created
+	 * within in. We only check for suspended as we can never be
+	 * active in the kernel, we are active, there is nothing better to
+	 * do than go ahead and Bad Thing later.
+	 * The cause is not important as there will never be a
+	 * recheckpoint so it's not user visible.
+	 */
+	if (MSR_TM_SUSPENDED(mfmsr()))
+		tm_reclaim_current(0);
+
+	if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
+		goto badframe;
+	if (MSR_TM_ACTIVE(msr)) {
+		/* We recheckpoint on return. */
+		struct ucontext __user *uc_transact;
+
+		/* Trying to start TM on non TM system */
+		if (!cpu_has_feature(CPU_FTR_TM))
+			goto badframe;
+
+		if (__get_user(uc_transact, &uc->uc_link))
+			goto badframe;
+		if (restore_tm_sigcontexts(current, &uc->uc_mcontext,
+					   &uc_transact->uc_mcontext))
+			goto badframe;
+	} else
+#endif
+	{
+		/*
+		 * Fall through, for non-TM restore
+		 *
+		 * Unset MSR[TS] on the thread regs since MSR from user
+		 * context does not have MSR active, and recheckpoint was
+		 * not called since restore_tm_sigcontexts() was not called
+		 * also.
+		 *
+		 * If not unsetting it, the code can RFID to userspace with
+		 * MSR[TS] set, but without CPU in the proper state,
+		 * causing a TM bad thing.
+		 */
+		current->thread.regs->msr &= ~MSR_TS_MASK;
+		if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+			goto badframe;
+	}
+
+	if (restore_altstack(&uc->uc_stack))
+		goto badframe;
+
+	set_thread_flag(TIF_RESTOREALL);
+	return 0;
+
+badframe:
+	if (show_unhandled_signals)
+		printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+				   current->comm, current->pid, "rt_sigreturn",
+				   (long)uc, regs->nip, regs->link);
+
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
+		struct task_struct *tsk)
+{
+	struct rt_sigframe __user *frame;
+	unsigned long newsp = 0;
+	long err = 0;
+	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
+
+	BUG_ON(tsk != current);
+
+	frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
+	if (unlikely(frame == NULL))
+		goto badframe;
+
+	err |= __put_user(&frame->info, &frame->pinfo);
+	err |= __put_user(&frame->uc, &frame->puc);
+	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+	if (err)
+		goto badframe;
+
+	/* Create the ucontext.  */
+	err |= __put_user(0, &frame->uc.uc_flags);
+	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (MSR_TM_ACTIVE(msr)) {
+		/* The ucontext_t passed to userland points to the second
+		 * ucontext_t (for transactional state) with its uc_link ptr.
+		 */
+		err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
+		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
+					    &frame->uc_transact.uc_mcontext,
+					    tsk, ksig->sig, NULL,
+					    (unsigned long)ksig->ka.sa.sa_handler,
+					    msr);
+	} else
+#endif
+	{
+		err |= __put_user(0, &frame->uc.uc_link);
+		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
+					NULL, (unsigned long)ksig->ka.sa.sa_handler,
+					1);
+	}
+	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+	if (err)
+		goto badframe;
+
+	/* Make sure signal handler doesn't get spurious FP exceptions */
+	tsk->thread.fp_state.fpscr = 0;
+
+	/* Set up to return from userspace. */
+	if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
+		regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
+	} else {
+		err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
+		if (err)
+			goto badframe;
+		regs->link = (unsigned long) &frame->tramp[0];
+	}
+
+	/* Allocate a dummy caller frame for the signal handler. */
+	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
+	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
+
+	/* Set up "regs" so we "return" to the signal handler. */
+	if (is_elf2_task()) {
+		regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
+		regs->gpr[12] = regs->nip;
+	} else {
+		/* Handler is *really* a pointer to the function descriptor for
+		 * the signal routine.  The first entry in the function
+		 * descriptor is the entry address of signal and the second
+		 * entry is the TOC value we need to use.
+		 */
+		func_descr_t __user *funct_desc_ptr =
+			(func_descr_t __user *) ksig->ka.sa.sa_handler;
+
+		err |= get_user(regs->nip, &funct_desc_ptr->entry);
+		err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
+	}
+
+	/* enter the signal handler in native-endian mode */
+	regs->msr &= ~MSR_LE;
+	regs->msr |= (MSR_KERNEL & MSR_LE);
+	regs->gpr[1] = newsp;
+	regs->gpr[3] = ksig->sig;
+	regs->result = 0;
+	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+		err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
+		err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
+		regs->gpr[6] = (unsigned long) frame;
+	} else {
+		regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
+	}
+	if (err)
+		goto badframe;
+
+	return 0;
+
+badframe:
+	if (show_unhandled_signals)
+		printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+				   tsk->comm, tsk->pid, "setup_rt_frame",
+				   (long)frame, regs->nip, regs->link);
+
+	return 1;
+}
diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c
new file mode 100644
index 000000000..21c39355b
--- /dev/null
+++ b/arch/powerpc/kernel/smp-tbsync.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Smp timebase synchronization for ppc.
+ *
+ * Copyright (C) 2003 Samuel Rydh (samuel@ibrium.se)
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <asm/smp.h>
+#include <asm/time.h>
+
+#define NUM_ITER		300
+
+enum {
+	kExit=0, kSetAndTest, kTest
+};
+
+static struct {
+	volatile u64		tb;
+	volatile u64		mark;
+	volatile int		cmd;
+	volatile int		handshake;
+	int			filler[2];
+
+	volatile int		ack;
+	int			filler2[7];
+
+	volatile int		race_result;
+} *tbsync;
+
+static volatile int		running;
+
+static void enter_contest(u64 mark, long add)
+{
+	while (get_tb() < mark)
+		tbsync->race_result = add;
+}
+
+void smp_generic_take_timebase(void)
+{
+	int cmd;
+	u64 tb;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	while (!running)
+		barrier();
+	rmb();
+
+	for (;;) {
+		tbsync->ack = 1;
+		while (!tbsync->handshake)
+			barrier();
+		rmb();
+
+		cmd = tbsync->cmd;
+		tb = tbsync->tb;
+		mb();
+		tbsync->ack = 0;
+		if (cmd == kExit)
+			break;
+
+		while (tbsync->handshake)
+			barrier();
+		if (cmd == kSetAndTest)
+			set_tb(tb >> 32, tb & 0xfffffffful);
+		enter_contest(tbsync->mark, -1);
+	}
+	local_irq_restore(flags);
+}
+
+static int start_contest(int cmd, long offset, int num)
+{
+	int i, score=0;
+	u64 tb;
+	u64 mark;
+
+	tbsync->cmd = cmd;
+
+	local_irq_disable();
+	for (i = -3; i < num; ) {
+		tb = get_tb() + 400;
+		tbsync->tb = tb + offset;
+		tbsync->mark = mark = tb + 400;
+
+		wmb();
+
+		tbsync->handshake = 1;
+		while (tbsync->ack)
+			barrier();
+
+		while (get_tb() <= tb)
+			barrier();
+		tbsync->handshake = 0;
+		enter_contest(mark, 1);
+
+		while (!tbsync->ack)
+			barrier();
+
+		if (i++ > 0)
+			score += tbsync->race_result;
+	}
+	local_irq_enable();
+	return score;
+}
+
+void smp_generic_give_timebase(void)
+{
+	int i, score, score2, old, min=0, max=5000, offset=1000;
+
+	pr_debug("Software timebase sync\n");
+
+	/* if this fails then this kernel won't work anyway... */
+	tbsync = kzalloc( sizeof(*tbsync), GFP_KERNEL );
+	mb();
+	running = 1;
+
+	while (!tbsync->ack)
+		barrier();
+
+	pr_debug("Got ack\n");
+
+	/* binary search */
+	for (old = -1; old != offset ; offset = (min+max) / 2) {
+		score = start_contest(kSetAndTest, offset, NUM_ITER);
+
+		pr_debug("score %d, offset %d\n", score, offset );
+
+		if( score > 0 )
+			max = offset;
+		else
+			min = offset;
+		old = offset;
+	}
+	score = start_contest(kSetAndTest, min, NUM_ITER);
+	score2 = start_contest(kSetAndTest, max, NUM_ITER);
+
+	pr_debug("Min %d (score %d), Max %d (score %d)\n",
+		 min, score, max, score2);
+	score = abs(score);
+	score2 = abs(score2);
+	offset = (score < score2) ? min : max;
+
+	/* guard against inaccurate mttb */
+	for (i = 0; i < 10; i++) {
+		start_contest(kSetAndTest, offset, NUM_ITER/10);
+
+		if ((score2 = start_contest(kTest, offset, NUM_ITER)) < 0)
+			score2 = -score2;
+		if (score2 <= score || score2 < 20)
+			break;
+	}
+	pr_debug("Final offset: %d (%d/%d)\n", offset, score2, NUM_ITER );
+
+	/* exiting */
+	tbsync->cmd = kExit;
+	wmb();
+	tbsync->handshake = 1;
+	while (tbsync->ack)
+		barrier();
+	tbsync->handshake = 0;
+	kfree(tbsync);
+	tbsync = NULL;
+	running = 0;
+}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
new file mode 100644
index 000000000..60fc3c71a
--- /dev/null
+++ b/arch/powerpc/kernel/smp.c
@@ -0,0 +1,1234 @@
+/*
+ * SMP support for ppc.
+ *
+ * Written by Cort Dougan (cort@cs.nmt.edu) borrowing a great
+ * deal of code from the sparc and intel versions.
+ *
+ * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
+ *
+ * PowerPC-64 Support added by Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/profile.h>
+#include <linux/processor.h>
+
+#include <asm/ptrace.h>
+#include <linux/atomic.h>
+#include <asm/irq.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/cputable.h>
+#include <asm/mpic.h>
+#include <asm/vdso_datapage.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#endif
+#include <asm/vdso.h>
+#include <asm/debug.h>
+#include <asm/kexec.h>
+#include <asm/asm-prototypes.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/ftrace.h>
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* State of each CPU during hotplug phases */
+static DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#endif
+
+struct thread_info *secondary_ti;
+
+DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
+EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+/* SMP operations for this machine */
+struct smp_ops_t *smp_ops;
+
+/* Can't be static due to PowerMac hackery */
+volatile unsigned int cpu_callin_map[NR_CPUS];
+
+int smt_enabled_at_boot = 1;
+
+/*
+ * Returns 1 if the specified cpu should be brought up during boot.
+ * Used to inhibit booting threads if they've been disabled or
+ * limited on the command line
+ */
+int smp_generic_cpu_bootable(unsigned int nr)
+{
+	/* Special case - we inhibit secondary thread startup
+	 * during boot if the user requests it.
+	 */
+	if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+		if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
+			return 0;
+		if (smt_enabled_at_boot
+		    && cpu_thread_in_core(nr) >= smt_enabled_at_boot)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+#ifdef CONFIG_PPC64
+int smp_generic_kick_cpu(int nr)
+{
+	if (nr < 0 || nr >= nr_cpu_ids)
+		return -EINVAL;
+
+	/*
+	 * The processor is currently spinning, waiting for the
+	 * cpu_start field to become non-zero After we set cpu_start,
+	 * the processor will continue on to secondary_start
+	 */
+	if (!paca_ptrs[nr]->cpu_start) {
+		paca_ptrs[nr]->cpu_start = 1;
+		smp_mb();
+		return 0;
+	}
+
+#ifdef CONFIG_HOTPLUG_CPU
+	/*
+	 * Ok it's not there, so it might be soft-unplugged, let's
+	 * try to bring it back
+	 */
+	generic_set_cpu_up(nr);
+	smp_wmb();
+	smp_send_reschedule(nr);
+#endif /* CONFIG_HOTPLUG_CPU */
+
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+
+static irqreturn_t call_function_action(int irq, void *data)
+{
+	generic_smp_call_function_interrupt();
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t reschedule_action(int irq, void *data)
+{
+	scheduler_ipi();
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
+{
+	timer_broadcast_interrupt();
+	return IRQ_HANDLED;
+}
+#endif
+
+#ifdef CONFIG_NMI_IPI
+static irqreturn_t nmi_ipi_action(int irq, void *data)
+{
+	smp_handle_nmi_ipi(get_irq_regs());
+	return IRQ_HANDLED;
+}
+#endif
+
+static irq_handler_t smp_ipi_action[] = {
+	[PPC_MSG_CALL_FUNCTION] =  call_function_action,
+	[PPC_MSG_RESCHEDULE] = reschedule_action,
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+	[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
+#endif
+#ifdef CONFIG_NMI_IPI
+	[PPC_MSG_NMI_IPI] = nmi_ipi_action,
+#endif
+};
+
+/*
+ * The NMI IPI is a fallback and not truly non-maskable. It is simpler
+ * than going through the call function infrastructure, and strongly
+ * serialized, so it is more appropriate for debugging.
+ */
+const char *smp_ipi_name[] = {
+	[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
+	[PPC_MSG_RESCHEDULE] = "ipi reschedule",
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+	[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
+#endif
+#ifdef CONFIG_NMI_IPI
+	[PPC_MSG_NMI_IPI] = "nmi ipi",
+#endif
+};
+
+/* optional function to request ipi, for controllers with >= 4 ipis */
+int smp_request_message_ipi(int virq, int msg)
+{
+	int err;
+
+	if (msg < 0 || msg > PPC_MSG_NMI_IPI)
+		return -EINVAL;
+#ifndef CONFIG_NMI_IPI
+	if (msg == PPC_MSG_NMI_IPI)
+		return 1;
+#endif
+
+	err = request_irq(virq, smp_ipi_action[msg],
+			  IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
+			  smp_ipi_name[msg], NULL);
+	WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n",
+		virq, smp_ipi_name[msg], err);
+
+	return err;
+}
+
+#ifdef CONFIG_PPC_SMP_MUXED_IPI
+struct cpu_messages {
+	long messages;			/* current messages */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
+
+void smp_muxed_ipi_set_message(int cpu, int msg)
+{
+	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+	char *message = (char *)&info->messages;
+
+	/*
+	 * Order previous accesses before accesses in the IPI handler.
+	 */
+	smp_mb();
+	message[msg] = 1;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+	smp_muxed_ipi_set_message(cpu, msg);
+
+	/*
+	 * cause_ipi functions are required to include a full barrier
+	 * before doing whatever causes the IPI.
+	 */
+	smp_ops->cause_ipi(cpu);
+}
+
+#ifdef __BIG_ENDIAN__
+#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
+#else
+#define IPI_MESSAGE(A) (1uL << (8 * (A)))
+#endif
+
+irqreturn_t smp_ipi_demux(void)
+{
+	mb();	/* order any irq clear */
+
+	return smp_ipi_demux_relaxed();
+}
+
+/* sync-free variant. Callers should ensure synchronization */
+irqreturn_t smp_ipi_demux_relaxed(void)
+{
+	struct cpu_messages *info;
+	unsigned long all;
+
+	info = this_cpu_ptr(&ipi_message);
+	do {
+		all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+		/*
+		 * Must check for PPC_MSG_RM_HOST_ACTION messages
+		 * before PPC_MSG_CALL_FUNCTION messages because when
+		 * a VM is destroyed, we call kick_all_cpus_sync()
+		 * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+		 * messages have completed before we free any VCPUs.
+		 */
+		if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+			kvmppc_xics_ipi_action();
+#endif
+		if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
+			generic_smp_call_function_interrupt();
+		if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
+			scheduler_ipi();
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+		if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
+			timer_broadcast_interrupt();
+#endif
+#ifdef CONFIG_NMI_IPI
+		if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
+			nmi_ipi_action(0, NULL);
+#endif
+	} while (info->messages);
+
+	return IRQ_HANDLED;
+}
+#endif /* CONFIG_PPC_SMP_MUXED_IPI */
+
+static inline void do_message_pass(int cpu, int msg)
+{
+	if (smp_ops->message_pass)
+		smp_ops->message_pass(cpu, msg);
+#ifdef CONFIG_PPC_SMP_MUXED_IPI
+	else
+		smp_muxed_ipi_message_pass(cpu, msg);
+#endif
+}
+
+void smp_send_reschedule(int cpu)
+{
+	if (likely(smp_ops))
+		do_message_pass(cpu, PPC_MSG_RESCHEDULE);
+}
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
+
+void arch_send_call_function_single_ipi(int cpu)
+{
+	do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+}
+
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask)
+		do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+}
+
+#ifdef CONFIG_NMI_IPI
+
+/*
+ * "NMI IPI" system.
+ *
+ * NMI IPIs may not be recoverable, so should not be used as ongoing part of
+ * a running system. They can be used for crash, debug, halt/reboot, etc.
+ *
+ * The IPI call waits with interrupts disabled until all targets enter the
+ * NMI handler, then returns. Subsequent IPIs can be issued before targets
+ * have returned from their handlers, so there is no guarantee about
+ * concurrency or re-entrancy.
+ *
+ * A new NMI can be issued before all targets exit the handler.
+ *
+ * The IPI call may time out without all targets entering the NMI handler.
+ * In that case, there is some logic to recover (and ignore subsequent
+ * NMI interrupts that may eventually be raised), but the platform interrupt
+ * handler may not be able to distinguish this from other exception causes,
+ * which may cause a crash.
+ */
+
+static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
+static struct cpumask nmi_ipi_pending_mask;
+static bool nmi_ipi_busy = false;
+static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
+
+static void nmi_ipi_lock_start(unsigned long *flags)
+{
+	raw_local_irq_save(*flags);
+	hard_irq_disable();
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
+		raw_local_irq_restore(*flags);
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
+}
+
+static void nmi_ipi_lock(void)
+{
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
+}
+
+static void nmi_ipi_unlock(void)
+{
+	smp_mb();
+	WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
+	atomic_set(&__nmi_ipi_lock, 0);
+}
+
+static void nmi_ipi_unlock_end(unsigned long *flags)
+{
+	nmi_ipi_unlock();
+	raw_local_irq_restore(*flags);
+}
+
+/*
+ * Platform NMI handler calls this to ack
+ */
+int smp_handle_nmi_ipi(struct pt_regs *regs)
+{
+	void (*fn)(struct pt_regs *) = NULL;
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 0;
+
+	/*
+	 * Unexpected NMIs are possible here because the interrupt may not
+	 * be able to distinguish NMI IPIs from other types of NMIs, or
+	 * because the caller may have timed out.
+	 */
+	nmi_ipi_lock_start(&flags);
+	if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) {
+		cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+		fn = READ_ONCE(nmi_ipi_function);
+		WARN_ON_ONCE(!fn);
+		ret = 1;
+	}
+	nmi_ipi_unlock_end(&flags);
+
+	if (fn)
+		fn(regs);
+
+	return ret;
+}
+
+static void do_smp_send_nmi_ipi(int cpu, bool safe)
+{
+	if (!safe && smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
+		return;
+
+	if (cpu >= 0) {
+		do_message_pass(cpu, PPC_MSG_NMI_IPI);
+	} else {
+		int c;
+
+		for_each_online_cpu(c) {
+			if (c == raw_smp_processor_id())
+				continue;
+			do_message_pass(c, PPC_MSG_NMI_IPI);
+		}
+	}
+}
+
+/*
+ * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
+ * - fn is the target callback function.
+ * - delay_us > 0 is the delay before giving up waiting for targets to
+ *   begin executing the handler, == 0 specifies indefinite delay.
+ */
+int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe)
+{
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 1;
+
+	BUG_ON(cpu == me);
+	BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
+
+	if (unlikely(!smp_ops))
+		return 0;
+
+	nmi_ipi_lock_start(&flags);
+	while (nmi_ipi_busy) {
+		nmi_ipi_unlock_end(&flags);
+		spin_until_cond(!nmi_ipi_busy);
+		nmi_ipi_lock_start(&flags);
+	}
+	nmi_ipi_busy = true;
+	nmi_ipi_function = fn;
+
+	WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask));
+
+	if (cpu < 0) {
+		/* ALL_OTHERS */
+		cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
+		cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+	} else {
+		cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
+	}
+
+	nmi_ipi_unlock();
+
+	/* Interrupts remain hard disabled */
+
+	do_smp_send_nmi_ipi(cpu, safe);
+
+	nmi_ipi_lock();
+	/* nmi_ipi_busy is set here, so unlock/lock is okay */
+	while (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		nmi_ipi_unlock();
+		udelay(1);
+		nmi_ipi_lock();
+		if (delay_us) {
+			delay_us--;
+			if (!delay_us)
+				break;
+		}
+	}
+
+	if (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		/* Timeout waiting for CPUs to call smp_handle_nmi_ipi */
+		ret = 0;
+		cpumask_clear(&nmi_ipi_pending_mask);
+	}
+
+	nmi_ipi_function = NULL;
+	nmi_ipi_busy = false;
+
+	nmi_ipi_unlock_end(&flags);
+
+	return ret;
+}
+
+int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+{
+	return __smp_send_nmi_ipi(cpu, fn, delay_us, false);
+}
+
+int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+{
+	return __smp_send_nmi_ipi(cpu, fn, delay_us, true);
+}
+#endif /* CONFIG_NMI_IPI */
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+void tick_broadcast(const struct cpumask *mask)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask)
+		do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
+}
+#endif
+
+#ifdef CONFIG_DEBUGGER
+void debugger_ipi_callback(struct pt_regs *regs)
+{
+	debugger_ipi(regs);
+}
+
+void smp_send_debugger_break(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
+}
+#endif
+
+#ifdef CONFIG_KEXEC_CORE
+void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
+{
+	int cpu;
+
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
+	if (kdump_in_progress() && crash_wake_offline) {
+		for_each_present_cpu(cpu) {
+			if (cpu_online(cpu))
+				continue;
+			/*
+			 * crash_ipi_callback will wait for
+			 * all cpus, including offline CPUs.
+			 * We don't care about nmi_ipi_function.
+			 * Offline cpus will jump straight into
+			 * crash_ipi_callback, we can skip the
+			 * entire NMI dance and waiting for
+			 * cpus to clear pending mask, etc.
+			 */
+			do_smp_send_nmi_ipi(cpu, false);
+		}
+	}
+}
+#endif
+
+#ifdef CONFIG_NMI_IPI
+static void crash_stop_this_cpu(struct pt_regs *regs)
+#else
+static void crash_stop_this_cpu(void *dummy)
+#endif
+{
+	/*
+	 * Just busy wait here and avoid marking CPU as offline to ensure
+	 * register data is captured appropriately.
+	 */
+	while (1)
+		cpu_relax();
+}
+
+void crash_smp_send_stop(void)
+{
+	static bool stopped = false;
+
+	if (stopped)
+		return;
+
+	stopped = true;
+
+#ifdef CONFIG_NMI_IPI
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_stop_this_cpu, 1000000);
+#else
+	smp_call_function(crash_stop_this_cpu, NULL, 0);
+#endif /* CONFIG_NMI_IPI */
+}
+
+#ifdef CONFIG_NMI_IPI
+static void nmi_stop_this_cpu(struct pt_regs *regs)
+{
+	/*
+	 * IRQs are already hard disabled by the smp_handle_nmi_ipi.
+	 */
+	set_cpu_online(smp_processor_id(), false);
+
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
+void smp_send_stop(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, nmi_stop_this_cpu, 1000000);
+}
+
+#else /* CONFIG_NMI_IPI */
+
+static void stop_this_cpu(void *dummy)
+{
+	hard_irq_disable();
+
+	/*
+	 * Offlining CPUs in stop_this_cpu can result in scheduler warnings,
+	 * (see commit de6e5d38417e), but printk_safe_flush_on_panic() wants
+	 * to know other CPUs are offline before it breaks locks to flush
+	 * printk buffers, in case we panic()ed while holding the lock.
+	 */
+	set_cpu_online(smp_processor_id(), false);
+
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
+void smp_send_stop(void)
+{
+	static bool stopped = false;
+
+	/*
+	 * Prevent waiting on csd lock from a previous smp_send_stop.
+	 * This is racy, but in general callers try to do the right
+	 * thing and only fire off one smp_send_stop (e.g., see
+	 * kernel/panic.c)
+	 */
+	if (stopped)
+		return;
+
+	stopped = true;
+
+	smp_call_function(stop_this_cpu, NULL, 0);
+}
+#endif /* CONFIG_NMI_IPI */
+
+struct thread_info *current_set[NR_CPUS];
+
+static void smp_store_cpu_info(int id)
+{
+	per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR);
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	per_cpu(next_tlbcam_idx, id)
+		= (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
+#endif
+}
+
+/*
+ * Relationships between CPUs are maintained in a set of per-cpu cpumasks so
+ * rather than just passing around the cpumask we pass around a function that
+ * returns the that cpumask for the given CPU.
+ */
+static void set_cpus_related(int i, int j, struct cpumask *(*get_cpumask)(int))
+{
+	cpumask_set_cpu(i, get_cpumask(j));
+	cpumask_set_cpu(j, get_cpumask(i));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void set_cpus_unrelated(int i, int j,
+		struct cpumask *(*get_cpumask)(int))
+{
+	cpumask_clear_cpu(i, get_cpumask(j));
+	cpumask_clear_cpu(j, get_cpumask(i));
+}
+#endif
+
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+	unsigned int cpu;
+
+	DBG("smp_prepare_cpus\n");
+
+	/* 
+	 * setup_cpu may need to be called on the boot cpu. We havent
+	 * spun any cpus up but lets be paranoid.
+	 */
+	BUG_ON(boot_cpuid != smp_processor_id());
+
+	/* Fixup boot cpu */
+	smp_store_cpu_info(boot_cpuid);
+	cpu_callin_map[boot_cpuid] = 1;
+
+	for_each_possible_cpu(cpu) {
+		zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
+					GFP_KERNEL, cpu_to_node(cpu));
+		zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
+					GFP_KERNEL, cpu_to_node(cpu));
+		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
+					GFP_KERNEL, cpu_to_node(cpu));
+		/*
+		 * numa_node_id() works after this.
+		 */
+		if (cpu_present(cpu)) {
+			set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
+			set_cpu_numa_mem(cpu,
+				local_memory_node(numa_cpu_lookup_table[cpu]));
+		}
+	}
+
+	/* Init the cpumasks so the boot CPU is related to itself */
+	cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
+	cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
+	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
+
+	if (smp_ops && smp_ops->probe)
+		smp_ops->probe();
+}
+
+void smp_prepare_boot_cpu(void)
+{
+	BUG_ON(smp_processor_id() != boot_cpuid);
+#ifdef CONFIG_PPC64
+	paca_ptrs[boot_cpuid]->__current = current;
+#endif
+	set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
+	current_set[boot_cpuid] = task_thread_info(current);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+int generic_cpu_disable(void)
+{
+	unsigned int cpu = smp_processor_id();
+
+	if (cpu == boot_cpuid)
+		return -EBUSY;
+
+	set_cpu_online(cpu, false);
+#ifdef CONFIG_PPC64
+	vdso_data->processorCount--;
+#endif
+	/* Update affinity of all IRQs previously aimed at this CPU */
+	irq_migrate_all_off_this_cpu();
+
+	/*
+	 * Depending on the details of the interrupt controller, it's possible
+	 * that one of the interrupts we just migrated away from this CPU is
+	 * actually already pending on this CPU. If we leave it in that state
+	 * the interrupt will never be EOI'ed, and will never fire again. So
+	 * temporarily enable interrupts here, to allow any pending interrupt to
+	 * be received (and EOI'ed), before we take this CPU offline.
+	 */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+
+	return 0;
+}
+
+void generic_cpu_die(unsigned int cpu)
+{
+	int i;
+
+	for (i = 0; i < 100; i++) {
+		smp_rmb();
+		if (is_cpu_dead(cpu))
+			return;
+		msleep(100);
+	}
+	printk(KERN_ERR "CPU%d didn't die...\n", cpu);
+}
+
+void generic_set_cpu_dead(unsigned int cpu)
+{
+	per_cpu(cpu_state, cpu) = CPU_DEAD;
+}
+
+/*
+ * The cpu_state should be set to CPU_UP_PREPARE in kick_cpu(), otherwise
+ * the cpu_state is always CPU_DEAD after calling generic_set_cpu_dead(),
+ * which makes the delay in generic_cpu_die() not happen.
+ */
+void generic_set_cpu_up(unsigned int cpu)
+{
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+}
+
+int generic_check_cpu_restart(unsigned int cpu)
+{
+	return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
+}
+
+int is_cpu_dead(unsigned int cpu)
+{
+	return per_cpu(cpu_state, cpu) == CPU_DEAD;
+}
+
+static bool secondaries_inhibited(void)
+{
+	return kvm_hv_mode_active();
+}
+
+#else /* HOTPLUG_CPU */
+
+#define secondaries_inhibited()		0
+
+#endif
+
+static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
+{
+	struct thread_info *ti = task_thread_info(idle);
+
+#ifdef CONFIG_PPC64
+	paca_ptrs[cpu]->__current = idle;
+	paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD;
+#endif
+	ti->cpu = cpu;
+	secondary_ti = current_set[cpu] = ti;
+}
+
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+	int rc, c;
+
+	/*
+	 * Don't allow secondary threads to come online if inhibited
+	 */
+	if (threads_per_core > 1 && secondaries_inhibited() &&
+	    cpu_thread_in_subcore(cpu))
+		return -EBUSY;
+
+	if (smp_ops == NULL ||
+	    (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
+		return -EINVAL;
+
+	cpu_idle_thread_init(cpu, tidle);
+
+	/*
+	 * The platform might need to allocate resources prior to bringing
+	 * up the CPU
+	 */
+	if (smp_ops->prepare_cpu) {
+		rc = smp_ops->prepare_cpu(cpu);
+		if (rc)
+			return rc;
+	}
+
+	/* Make sure callin-map entry is 0 (can be leftover a CPU
+	 * hotplug
+	 */
+	cpu_callin_map[cpu] = 0;
+
+	/* The information for processor bringup must
+	 * be written out to main store before we release
+	 * the processor.
+	 */
+	smp_mb();
+
+	/* wake up cpus */
+	DBG("smp: kicking cpu %d\n", cpu);
+	rc = smp_ops->kick_cpu(cpu);
+	if (rc) {
+		pr_err("smp: failed starting cpu %d (rc %d)\n", cpu, rc);
+		return rc;
+	}
+
+	/*
+	 * wait to see if the cpu made a callin (is actually up).
+	 * use this value that I found through experimentation.
+	 * -- Cort
+	 */
+	if (system_state < SYSTEM_RUNNING)
+		for (c = 50000; c && !cpu_callin_map[cpu]; c--)
+			udelay(100);
+#ifdef CONFIG_HOTPLUG_CPU
+	else
+		/*
+		 * CPUs can take much longer to come up in the
+		 * hotplug case.  Wait five seconds.
+		 */
+		for (c = 5000; c && !cpu_callin_map[cpu]; c--)
+			msleep(1);
+#endif
+
+	if (!cpu_callin_map[cpu]) {
+		printk(KERN_ERR "Processor %u is stuck.\n", cpu);
+		return -ENOENT;
+	}
+
+	DBG("Processor %u found.\n", cpu);
+
+	if (smp_ops->give_timebase)
+		smp_ops->give_timebase();
+
+	/* Wait until cpu puts itself in the online & active maps */
+	spin_until_cond(cpu_online(cpu));
+
+	return 0;
+}
+
+/* Return the value of the reg property corresponding to the given
+ * logical cpu.
+ */
+int cpu_to_core_id(int cpu)
+{
+	struct device_node *np;
+	const __be32 *reg;
+	int id = -1;
+
+	np = of_get_cpu_node(cpu, NULL);
+	if (!np)
+		goto out;
+
+	reg = of_get_property(np, "reg", NULL);
+	if (!reg)
+		goto out;
+
+	id = be32_to_cpup(reg);
+out:
+	of_node_put(np);
+	return id;
+}
+EXPORT_SYMBOL_GPL(cpu_to_core_id);
+
+/* Helper routines for cpu to core mapping */
+int cpu_core_index_of_thread(int cpu)
+{
+	return cpu >> threads_shift;
+}
+EXPORT_SYMBOL_GPL(cpu_core_index_of_thread);
+
+int cpu_first_thread_of_core(int core)
+{
+	return core << threads_shift;
+}
+EXPORT_SYMBOL_GPL(cpu_first_thread_of_core);
+
+/* Must be called when no change can occur to cpu_present_mask,
+ * i.e. during cpu online or offline.
+ */
+static struct device_node *cpu_to_l2cache(int cpu)
+{
+	struct device_node *np;
+	struct device_node *cache;
+
+	if (!cpu_present(cpu))
+		return NULL;
+
+	np = of_get_cpu_node(cpu, NULL);
+	if (np == NULL)
+		return NULL;
+
+	cache = of_find_next_cache_node(np);
+
+	of_node_put(np);
+
+	return cache;
+}
+
+static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
+{
+	struct device_node *l2_cache, *np;
+	int i;
+
+	l2_cache = cpu_to_l2cache(cpu);
+	if (!l2_cache)
+		return false;
+
+	for_each_cpu(i, cpu_online_mask) {
+		/*
+		 * when updating the marks the current CPU has not been marked
+		 * online, but we need to update the cache masks
+		 */
+		np = cpu_to_l2cache(i);
+		if (!np)
+			continue;
+
+		if (np == l2_cache)
+			set_cpus_related(cpu, i, mask_fn);
+
+		of_node_put(np);
+	}
+	of_node_put(l2_cache);
+
+	return true;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void remove_cpu_from_masks(int cpu)
+{
+	int i;
+
+	/* NB: cpu_core_mask is a superset of the others */
+	for_each_cpu(i, cpu_core_mask(cpu)) {
+		set_cpus_unrelated(cpu, i, cpu_core_mask);
+		set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
+		set_cpus_unrelated(cpu, i, cpu_sibling_mask);
+	}
+}
+#endif
+
+static void add_cpu_to_masks(int cpu)
+{
+	int first_thread = cpu_first_thread_sibling(cpu);
+	int chipid = cpu_to_chip_id(cpu);
+	int i;
+
+	/*
+	 * This CPU will not be in the online mask yet so we need to manually
+	 * add it to it's own thread sibling mask.
+	 */
+	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+
+	for (i = first_thread; i < first_thread + threads_per_core; i++)
+		if (cpu_online(i))
+			set_cpus_related(i, cpu, cpu_sibling_mask);
+
+	/*
+	 * Copy the thread sibling mask into the cache sibling mask
+	 * and mark any CPUs that share an L2 with this CPU.
+	 */
+	for_each_cpu(i, cpu_sibling_mask(cpu))
+		set_cpus_related(cpu, i, cpu_l2_cache_mask);
+	update_mask_by_l2(cpu, cpu_l2_cache_mask);
+
+	/*
+	 * Copy the cache sibling mask into core sibling mask and mark
+	 * any CPUs on the same chip as this CPU.
+	 */
+	for_each_cpu(i, cpu_l2_cache_mask(cpu))
+		set_cpus_related(cpu, i, cpu_core_mask);
+
+	if (chipid == -1)
+		return;
+
+	for_each_cpu(i, cpu_online_mask)
+		if (cpu_to_chip_id(i) == chipid)
+			set_cpus_related(cpu, i, cpu_core_mask);
+}
+
+static bool shared_caches;
+
+/* Activate a secondary processor. */
+void start_secondary(void *unused)
+{
+	unsigned int cpu = smp_processor_id();
+
+	mmgrab(&init_mm);
+	current->active_mm = &init_mm;
+
+	smp_store_cpu_info(cpu);
+	set_dec(tb_ticks_per_jiffy);
+	preempt_disable();
+	cpu_callin_map[cpu] = 1;
+
+	if (smp_ops->setup_cpu)
+		smp_ops->setup_cpu(cpu);
+	if (smp_ops->take_timebase)
+		smp_ops->take_timebase();
+
+	secondary_cpu_time_init();
+
+#ifdef CONFIG_PPC64
+	if (system_state == SYSTEM_RUNNING)
+		vdso_data->processorCount++;
+
+	vdso_getcpu_init();
+#endif
+	set_numa_node(numa_cpu_lookup_table[cpu]);
+	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
+
+	/* Update topology CPU masks */
+	add_cpu_to_masks(cpu);
+
+	/*
+	 * Check for any shared caches. Note that this must be done on a
+	 * per-core basis because one core in the pair might be disabled.
+	 */
+	if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu)))
+		shared_caches = true;
+
+	smp_wmb();
+	notify_cpu_starting(cpu);
+	set_cpu_online(cpu, true);
+
+	local_irq_enable();
+
+	/* We can enable ftrace for secondary cpus now */
+	this_cpu_enable_ftrace();
+
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+
+	BUG();
+}
+
+#ifdef CONFIG_PROFILING
+int setup_profiling_timer(unsigned int multiplier)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymetric SMT dependancy */
+static int powerpc_smt_flags(void)
+{
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		flags |= SD_ASYM_PACKING;
+	}
+	return flags;
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+	return cpu_l2_cache_mask(cpu);
+}
+
+static struct sched_domain_topology_level power9_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+	/*
+	 * We are running pinned to the boot CPU, see rest_init().
+	 */
+	if (smp_ops && smp_ops->setup_cpu)
+		smp_ops->setup_cpu(boot_cpuid);
+
+	if (smp_ops && smp_ops->bringup_done)
+		smp_ops->bringup_done();
+
+	/*
+	 * On a shared LPAR, associativity needs to be requested.
+	 * Hence, get numa topology before dumping cpu topology
+	 */
+	shared_proc_topology_init();
+	dump_numa_cpu_topology();
+
+	/*
+	 * If any CPU detects that it's sharing a cache with another CPU then
+	 * use the deeper topology that is aware of this sharing.
+	 */
+	if (shared_caches) {
+		pr_info("Using shared cache scheduler topology\n");
+		set_sched_topology(power9_topology);
+	} else {
+		pr_info("Using standard scheduler topology\n");
+		set_sched_topology(powerpc_topology);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int __cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+	int err;
+
+	if (!smp_ops->cpu_disable)
+		return -ENOSYS;
+
+	this_cpu_disable_ftrace();
+
+	err = smp_ops->cpu_disable();
+	if (err)
+		return err;
+
+	/* Update sibling maps */
+	remove_cpu_from_masks(cpu);
+
+	return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	if (smp_ops->cpu_die)
+		smp_ops->cpu_die(cpu);
+}
+
+void cpu_die(void)
+{
+	/*
+	 * Disable on the down path. This will be re-enabled by
+	 * start_secondary() via start_secondary_resume() below
+	 */
+	this_cpu_disable_ftrace();
+
+	if (ppc_md.cpu_die)
+		ppc_md.cpu_die();
+
+	/* If we return, we re-enter start_secondary */
+	start_secondary_resume();
+}
+
+#endif
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
new file mode 100644
index 000000000..dadcc8bab
--- /dev/null
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Stack trace utility functions etc.
+ *
+ * Copyright 2008 Christoph Hellwig, IBM Corp.
+ * Copyright 2018 SUSE Linux GmbH
+ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp.
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/stacktrace.h>
+#include <asm/ptrace.h>
+#include <asm/processor.h>
+#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <asm/kprobes.h>
+
+#include <asm/paca.h>
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ */
+static void save_context_stack(struct stack_trace *trace, unsigned long sp,
+			struct task_struct *tsk, int savesched)
+{
+	for (;;) {
+		unsigned long *stack = (unsigned long *) sp;
+		unsigned long newsp, ip;
+
+		if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
+			return;
+
+		newsp = stack[0];
+		ip = stack[STACK_FRAME_LR_SAVE];
+
+		if (savesched || !in_sched_functions(ip)) {
+			if (!trace->skip)
+				trace->entries[trace->nr_entries++] = ip;
+			else
+				trace->skip--;
+		}
+
+		if (trace->nr_entries >= trace->max_entries)
+			return;
+
+		sp = newsp;
+	}
+}
+
+void save_stack_trace(struct stack_trace *trace)
+{
+	unsigned long sp;
+
+	sp = current_stack_pointer();
+
+	save_context_stack(trace, sp, current, 1);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	unsigned long sp;
+
+	if (tsk == current)
+		sp = current_stack_pointer();
+	else
+		sp = tsk->thread.ksp;
+
+	save_context_stack(trace, sp, tsk, 0);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+void
+save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+	save_context_stack(trace, regs->gpr[1], current, 0);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_regs);
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+int
+save_stack_trace_tsk_reliable(struct task_struct *tsk,
+				struct stack_trace *trace)
+{
+	unsigned long sp;
+	unsigned long stack_page = (unsigned long)task_stack_page(tsk);
+	unsigned long stack_end;
+	int graph_idx = 0;
+
+	/*
+	 * The last frame (unwinding first) may not yet have saved
+	 * its LR onto the stack.
+	 */
+	int firstframe = 1;
+
+	if (tsk == current)
+		sp = current_stack_pointer();
+	else
+		sp = tsk->thread.ksp;
+
+	stack_end = stack_page + THREAD_SIZE;
+	if (!is_idle_task(tsk)) {
+		/*
+		 * For user tasks, this is the SP value loaded on
+		 * kernel entry, see "PACAKSAVE(r13)" in _switch() and
+		 * system_call_common()/EXCEPTION_PROLOG_COMMON().
+		 *
+		 * Likewise for non-swapper kernel threads,
+		 * this also happens to be the top of the stack
+		 * as setup by copy_thread().
+		 *
+		 * Note that stack backlinks are not properly setup by
+		 * copy_thread() and thus, a forked task() will have
+		 * an unreliable stack trace until it's been
+		 * _switch()'ed to for the first time.
+		 */
+		stack_end -= STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
+	} else {
+		/*
+		 * idle tasks have a custom stack layout,
+		 * c.f. cpu_idle_thread_init().
+		 */
+		stack_end -= STACK_FRAME_OVERHEAD;
+	}
+
+	if (sp < stack_page + sizeof(struct thread_struct) ||
+	    sp > stack_end - STACK_FRAME_MIN_SIZE) {
+		return 1;
+	}
+
+	for (;;) {
+		unsigned long *stack = (unsigned long *) sp;
+		unsigned long newsp, ip;
+
+		/* sanity check: ABI requires SP to be aligned 16 bytes. */
+		if (sp & 0xF)
+			return 1;
+
+		/* Mark stacktraces with exception frames as unreliable. */
+		if (sp <= stack_end - STACK_INT_FRAME_SIZE &&
+		    stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+			return 1;
+		}
+
+		newsp = stack[0];
+		/* Stack grows downwards; unwinder may only go up. */
+		if (newsp <= sp)
+			return 1;
+
+		if (newsp != stack_end &&
+		    newsp > stack_end - STACK_FRAME_MIN_SIZE) {
+			return 1; /* invalid backlink, too far up. */
+		}
+
+		/* Examine the saved LR: it must point into kernel code. */
+		ip = stack[STACK_FRAME_LR_SAVE];
+		if (!firstframe && !__kernel_text_address(ip))
+			return 1;
+		firstframe = 0;
+
+		/*
+		 * FIXME: IMHO these tests do not belong in
+		 * arch-dependent code, they are generic.
+		 */
+		ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL);
+#ifdef CONFIG_KPROBES
+		/*
+		 * Mark stacktraces with kretprobed functions on them
+		 * as unreliable.
+		 */
+		if (ip == (unsigned long)kretprobe_trampoline)
+			return 1;
+#endif
+
+		if (!trace->skip)
+			trace->entries[trace->nr_entries++] = ip;
+		else
+			trace->skip--;
+
+		if (newsp == stack_end)
+			break;
+
+		if (trace->nr_entries >= trace->max_entries)
+			return -E2BIG;
+
+		sp = newsp;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable);
+#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI)
+static void handle_backtrace_ipi(struct pt_regs *regs)
+{
+	nmi_cpu_backtrace(regs);
+}
+
+static void raise_backtrace_ipi(cpumask_t *mask)
+{
+	struct paca_struct *p;
+	unsigned int cpu;
+	u64 delay_us;
+
+	for_each_cpu(cpu, mask) {
+		if (cpu == smp_processor_id()) {
+			handle_backtrace_ipi(NULL);
+			continue;
+		}
+
+		delay_us = 5 * USEC_PER_SEC;
+
+		if (smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, delay_us)) {
+			// Now wait up to 5s for the other CPU to do its backtrace
+			while (cpumask_test_cpu(cpu, mask) && delay_us) {
+				udelay(1);
+				delay_us--;
+			}
+
+			// Other CPU cleared itself from the mask
+			if (delay_us)
+				continue;
+		}
+
+		p = paca_ptrs[cpu];
+
+		cpumask_clear_cpu(cpu, mask);
+
+		pr_warn("CPU %d didn't respond to backtrace IPI, inspecting paca.\n", cpu);
+		if (!virt_addr_valid(p)) {
+			pr_warn("paca pointer appears corrupt? (%px)\n", p);
+			continue;
+		}
+
+		pr_warn("irq_soft_mask: 0x%02x in_mce: %d in_nmi: %d",
+			p->irq_soft_mask, p->in_mce, p->in_nmi);
+
+		if (virt_addr_valid(p->__current))
+			pr_cont(" current: %d (%s)\n", p->__current->pid,
+				p->__current->comm);
+		else
+			pr_cont(" current pointer corrupt? (%px)\n", p->__current);
+
+		pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly stale):\n", p->saved_r1);
+		show_stack(p->__current, (unsigned long *)p->saved_r1);
+	}
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+	nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi);
+}
+#endif /* defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) */
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
new file mode 100644
index 000000000..a531154cc
--- /dev/null
+++ b/arch/powerpc/kernel/suspend.c
@@ -0,0 +1,23 @@
+/*
+ * Suspend support specific for power.
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/sections.h>
+
+/*
+ *	pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
new file mode 100644
index 000000000..0050b2d2f
--- /dev/null
+++ b/arch/powerpc/kernel/swsusp.c
@@ -0,0 +1,37 @@
+/*
+ * Common powerpc suspend code for 32 and 64 bits
+ *
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/suspend.h>
+#include <asm/current.h>
+#include <asm/mmu_context.h>
+#include <asm/switch_to.h>
+
+void save_processor_state(void)
+{
+	/*
+	 * flush out all the special registers so we don't need
+	 * to save them in the snapshot
+	 */
+	flush_all_to_thread(current);
+
+#ifdef CONFIG_PPC64
+	hard_irq_disable();
+#endif
+
+}
+
+void restore_processor_state(void)
+{
+#ifdef CONFIG_PPC32
+	switch_mmu_context(current->active_mm, current->active_mm, NULL);
+#endif
+}
diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S
new file mode 100644
index 000000000..54c44aea3
--- /dev/null
+++ b/arch/powerpc/kernel/swsusp_32.S
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/mmu.h>
+#include <asm/feature-fixups.h>
+
+/*
+ * Structure for storing CPU registers on the save area.
+ */
+#define SL_SP		0
+#define SL_PC		4
+#define SL_MSR		8
+#define SL_SDR1		0xc
+#define SL_SPRG0	0x10	/* 4 sprg's */
+#define SL_DBAT0	0x20
+#define SL_IBAT0	0x28
+#define SL_DBAT1	0x30
+#define SL_IBAT1	0x38
+#define SL_DBAT2	0x40
+#define SL_IBAT2	0x48
+#define SL_DBAT3	0x50
+#define SL_IBAT3	0x58
+#define SL_DBAT4	0x60
+#define SL_IBAT4	0x68
+#define SL_DBAT5	0x70
+#define SL_IBAT5	0x78
+#define SL_DBAT6	0x80
+#define SL_IBAT6	0x88
+#define SL_DBAT7	0x90
+#define SL_IBAT7	0x98
+#define SL_TB		0xa0
+#define SL_R2		0xa8
+#define SL_CR		0xac
+#define SL_LR		0xb0
+#define SL_R12		0xb4	/* r12 to r31 */
+#define SL_SIZE		(SL_R12 + 80)
+
+	.section .data
+	.align	5
+
+_GLOBAL(swsusp_save_area)
+	.space	SL_SIZE
+
+
+	.section .text
+	.align	5
+
+_GLOBAL(swsusp_arch_suspend)
+
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+
+	mflr	r0
+	stw	r0,SL_LR(r11)
+	mfcr	r0
+	stw	r0,SL_CR(r11)
+	stw	r1,SL_SP(r11)
+	stw	r2,SL_R2(r11)
+	stmw	r12,SL_R12(r11)
+
+	/* Save MSR & SDR1 */
+	mfmsr	r4
+	stw	r4,SL_MSR(r11)
+	mfsdr1	r4
+	stw	r4,SL_SDR1(r11)
+
+	/* Get a stable timebase and save it */
+1:	mftbu	r4
+	stw	r4,SL_TB(r11)
+	mftb	r5
+	stw	r5,SL_TB+4(r11)
+	mftbu	r3
+	cmpw	r3,r4
+	bne	1b
+
+	/* Save SPRGs */
+	mfsprg	r4,0
+	stw	r4,SL_SPRG0(r11)
+	mfsprg	r4,1
+	stw	r4,SL_SPRG0+4(r11)
+	mfsprg	r4,2
+	stw	r4,SL_SPRG0+8(r11)
+	mfsprg	r4,3
+	stw	r4,SL_SPRG0+12(r11)
+
+	/* Save BATs */
+	mfdbatu	r4,0
+	stw	r4,SL_DBAT0(r11)
+	mfdbatl	r4,0
+	stw	r4,SL_DBAT0+4(r11)
+	mfdbatu	r4,1
+	stw	r4,SL_DBAT1(r11)
+	mfdbatl	r4,1
+	stw	r4,SL_DBAT1+4(r11)
+	mfdbatu	r4,2
+	stw	r4,SL_DBAT2(r11)
+	mfdbatl	r4,2
+	stw	r4,SL_DBAT2+4(r11)
+	mfdbatu	r4,3
+	stw	r4,SL_DBAT3(r11)
+	mfdbatl	r4,3
+	stw	r4,SL_DBAT3+4(r11)
+	mfibatu	r4,0
+	stw	r4,SL_IBAT0(r11)
+	mfibatl	r4,0
+	stw	r4,SL_IBAT0+4(r11)
+	mfibatu	r4,1
+	stw	r4,SL_IBAT1(r11)
+	mfibatl	r4,1
+	stw	r4,SL_IBAT1+4(r11)
+	mfibatu	r4,2
+	stw	r4,SL_IBAT2(r11)
+	mfibatl	r4,2
+	stw	r4,SL_IBAT2+4(r11)
+	mfibatu	r4,3
+	stw	r4,SL_IBAT3(r11)
+	mfibatl	r4,3
+	stw	r4,SL_IBAT3+4(r11)
+
+BEGIN_MMU_FTR_SECTION
+	mfspr	r4,SPRN_DBAT4U
+	stw	r4,SL_DBAT4(r11)
+	mfspr	r4,SPRN_DBAT4L
+	stw	r4,SL_DBAT4+4(r11)
+	mfspr	r4,SPRN_DBAT5U
+	stw	r4,SL_DBAT5(r11)
+	mfspr	r4,SPRN_DBAT5L
+	stw	r4,SL_DBAT5+4(r11)
+	mfspr	r4,SPRN_DBAT6U
+	stw	r4,SL_DBAT6(r11)
+	mfspr	r4,SPRN_DBAT6L
+	stw	r4,SL_DBAT6+4(r11)
+	mfspr	r4,SPRN_DBAT7U
+	stw	r4,SL_DBAT7(r11)
+	mfspr	r4,SPRN_DBAT7L
+	stw	r4,SL_DBAT7+4(r11)
+	mfspr	r4,SPRN_IBAT4U
+	stw	r4,SL_IBAT4(r11)
+	mfspr	r4,SPRN_IBAT4L
+	stw	r4,SL_IBAT4+4(r11)
+	mfspr	r4,SPRN_IBAT5U
+	stw	r4,SL_IBAT5(r11)
+	mfspr	r4,SPRN_IBAT5L
+	stw	r4,SL_IBAT5+4(r11)
+	mfspr	r4,SPRN_IBAT6U
+	stw	r4,SL_IBAT6(r11)
+	mfspr	r4,SPRN_IBAT6L
+	stw	r4,SL_IBAT6+4(r11)
+	mfspr	r4,SPRN_IBAT7U
+	stw	r4,SL_IBAT7(r11)
+	mfspr	r4,SPRN_IBAT7L
+	stw	r4,SL_IBAT7+4(r11)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+
+#if  0
+	/* Backup various CPU config stuffs */
+	bl	__save_cpu_setup
+#endif
+	/* Call the low level suspend stuff (we should probably have made
+	 * a stackframe...
+	 */
+	bl	swsusp_save
+
+	/* Restore LR from the save area */
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+	lwz	r0,SL_LR(r11)
+	mtlr	r0
+
+	blr
+
+
+/* Resume code */
+_GLOBAL(swsusp_arch_resume)
+
+#ifdef CONFIG_ALTIVEC
+	/* Stop pending alitvec streams and memory accesses */
+BEGIN_FTR_SECTION
+	PPC_DSSALL
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+ 	sync
+
+	/* Disable MSR:DR to make sure we don't take a TLB or
+	 * hash miss during the copy, as our hash table will
+	 * for a while be unusable. For .text, we assume we are
+	 * covered by a BAT. This works only for non-G5 at this
+	 * point. G5 will need a better approach, possibly using
+	 * a small temporary hash table filled with large mappings,
+	 * disabling the MMU completely isn't a good option for
+	 * performance reasons.
+	 * (Note that 750's may have the same performance issue as
+	 * the G5 in this case, we should investigate using moving
+	 * BATs for these CPUs)
+	 */
+	mfmsr	r0
+	sync
+	rlwinm	r0,r0,0,28,26		/* clear MSR_DR */
+	mtmsr	r0
+	sync
+	isync
+
+	/* Load ptr the list of pages to copy in r3 */
+	lis	r11,(restore_pblist - KERNELBASE)@h
+	ori	r11,r11,restore_pblist@l
+	lwz	r10,0(r11)
+
+	/* Copy the pages. This is a very basic implementation, to
+	 * be replaced by something more cache efficient */
+1:
+	tophys(r3,r10)
+	li	r0,256
+	mtctr	r0
+	lwz	r11,pbe_address(r3)	/* source */
+	tophys(r5,r11)
+	lwz	r10,pbe_orig_address(r3)	/* destination */
+	tophys(r6,r10)
+2:
+	lwz	r8,0(r5)
+	lwz	r9,4(r5)
+	lwz	r10,8(r5)
+	lwz	r11,12(r5)
+	addi	r5,r5,16
+	stw	r8,0(r6)
+	stw	r9,4(r6)
+	stw	r10,8(r6)
+	stw	r11,12(r6)
+	addi	r6,r6,16
+	bdnz	2b
+	lwz		r10,pbe_next(r3)
+	cmpwi	0,r10,0
+	bne	1b
+
+	/* Do a very simple cache flush/inval of the L1 to ensure
+	 * coherency of the icache
+	 */
+	lis	r3,0x0002
+	mtctr	r3
+	li	r3, 0
+1:
+	lwz	r0,0(r3)
+	addi	r3,r3,0x0020
+	bdnz	1b
+	isync
+	sync
+
+	/* Now flush those cache lines */
+	lis	r3,0x0002
+	mtctr	r3
+	li	r3, 0
+1:
+	dcbf	0,r3
+	addi	r3,r3,0x0020
+	bdnz	1b
+	sync
+
+	/* Ok, we are now running with the kernel data of the old
+	 * kernel fully restored. We can get to the save area
+	 * easily now. As for the rest of the code, it assumes the
+	 * loader kernel and the booted one are exactly identical
+	 */
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+	tophys(r11,r11)
+
+#if 0
+	/* Restore various CPU config stuffs */
+	bl	__restore_cpu_setup
+#endif
+	/* Restore the BATs, and SDR1.  Then we can turn on the MMU.
+	 * This is a bit hairy as we are running out of those BATs,
+	 * but first, our code is probably in the icache, and we are
+	 * writing the same value to the BAT, so that should be fine,
+	 * though a better solution will have to be found long-term
+	 */
+	lwz	r4,SL_SDR1(r11)
+	mtsdr1	r4
+	lwz	r4,SL_SPRG0(r11)
+	mtsprg	0,r4
+	lwz	r4,SL_SPRG0+4(r11)
+	mtsprg	1,r4
+	lwz	r4,SL_SPRG0+8(r11)
+	mtsprg	2,r4
+	lwz	r4,SL_SPRG0+12(r11)
+	mtsprg	3,r4
+
+#if 0
+	lwz	r4,SL_DBAT0(r11)
+	mtdbatu	0,r4
+	lwz	r4,SL_DBAT0+4(r11)
+	mtdbatl	0,r4
+	lwz	r4,SL_DBAT1(r11)
+	mtdbatu	1,r4
+	lwz	r4,SL_DBAT1+4(r11)
+	mtdbatl	1,r4
+	lwz	r4,SL_DBAT2(r11)
+	mtdbatu	2,r4
+	lwz	r4,SL_DBAT2+4(r11)
+	mtdbatl	2,r4
+	lwz	r4,SL_DBAT3(r11)
+	mtdbatu	3,r4
+	lwz	r4,SL_DBAT3+4(r11)
+	mtdbatl	3,r4
+	lwz	r4,SL_IBAT0(r11)
+	mtibatu	0,r4
+	lwz	r4,SL_IBAT0+4(r11)
+	mtibatl	0,r4
+	lwz	r4,SL_IBAT1(r11)
+	mtibatu	1,r4
+	lwz	r4,SL_IBAT1+4(r11)
+	mtibatl	1,r4
+	lwz	r4,SL_IBAT2(r11)
+	mtibatu	2,r4
+	lwz	r4,SL_IBAT2+4(r11)
+	mtibatl	2,r4
+	lwz	r4,SL_IBAT3(r11)
+	mtibatu	3,r4
+	lwz	r4,SL_IBAT3+4(r11)
+	mtibatl	3,r4
+BEGIN_MMU_FTR_SECTION
+	lwz	r4,SL_DBAT4(r11)
+	mtspr	SPRN_DBAT4U,r4
+	lwz	r4,SL_DBAT4+4(r11)
+	mtspr	SPRN_DBAT4L,r4
+	lwz	r4,SL_DBAT5(r11)
+	mtspr	SPRN_DBAT5U,r4
+	lwz	r4,SL_DBAT5+4(r11)
+	mtspr	SPRN_DBAT5L,r4
+	lwz	r4,SL_DBAT6(r11)
+	mtspr	SPRN_DBAT6U,r4
+	lwz	r4,SL_DBAT6+4(r11)
+	mtspr	SPRN_DBAT6L,r4
+	lwz	r4,SL_DBAT7(r11)
+	mtspr	SPRN_DBAT7U,r4
+	lwz	r4,SL_DBAT7+4(r11)
+	mtspr	SPRN_DBAT7L,r4
+	lwz	r4,SL_IBAT4(r11)
+	mtspr	SPRN_IBAT4U,r4
+	lwz	r4,SL_IBAT4+4(r11)
+	mtspr	SPRN_IBAT4L,r4
+	lwz	r4,SL_IBAT5(r11)
+	mtspr	SPRN_IBAT5U,r4
+	lwz	r4,SL_IBAT5+4(r11)
+	mtspr	SPRN_IBAT5L,r4
+	lwz	r4,SL_IBAT6(r11)
+	mtspr	SPRN_IBAT6U,r4
+	lwz	r4,SL_IBAT6+4(r11)
+	mtspr	SPRN_IBAT6L,r4
+	lwz	r4,SL_IBAT7(r11)
+	mtspr	SPRN_IBAT7U,r4
+	lwz	r4,SL_IBAT7+4(r11)
+	mtspr	SPRN_IBAT7L,r4
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+#endif
+
+	/* Flush all TLBs */
+	lis	r4,0x1000
+1:	addic.	r4,r4,-0x1000
+	tlbie	r4
+	bgt	1b
+	sync
+
+	/* restore the MSR and turn on the MMU */
+	lwz	r3,SL_MSR(r11)
+	bl	turn_on_mmu
+	tovirt(r11,r11)
+
+	/* Restore TB */
+	li	r3,0
+	mttbl	r3
+	lwz	r3,SL_TB(r11)
+	lwz	r4,SL_TB+4(r11)
+	mttbu	r3
+	mttbl	r4
+
+	/* Kick decrementer */
+	li	r0,1
+	mtdec	r0
+
+	/* Restore the callee-saved registers and return */
+	lwz	r0,SL_CR(r11)
+	mtcr	r0
+	lwz	r2,SL_R2(r11)
+	lmw	r12,SL_R12(r11)
+	lwz	r1,SL_SP(r11)
+	lwz	r0,SL_LR(r11)
+	mtlr	r0
+
+	// XXX Note: we don't really need to call swsusp_resume
+
+	li	r3,0
+	blr
+
+/* FIXME:This construct is actually not useful since we don't shut
+ * down the instruction MMU, we could just flip back MSR-DR on.
+ */
+turn_on_mmu:
+	mflr	r4
+	mtsrr0	r4
+	mtsrr1	r3
+	sync
+	isync
+	rfi
+
diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c
new file mode 100644
index 000000000..51db01280
--- /dev/null
+++ b/arch/powerpc/kernel/swsusp_64.c
@@ -0,0 +1,25 @@
+/*
+ * PowerPC 64-bit swsusp implementation
+ *
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * GPLv2
+ */
+
+#include <asm/iommu.h>
+#include <linux/irq.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+
+void do_after_copyback(void)
+{
+	iommu_restore();
+	touch_softlockup_watchdog();
+	mb();
+}
+
+void _iommu_save(void)
+{
+	iommu_save();
+}
diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S
new file mode 100644
index 000000000..0af06f3db
--- /dev/null
+++ b/arch/powerpc/kernel/swsusp_asm64.S
@@ -0,0 +1,274 @@
+/*
+ * PowerPC 64-bit swsusp implementation
+ *
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * GPLv2
+ */
+
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/feature-fixups.h>
+
+/*
+ * Structure for storing CPU registers on the save area.
+ */
+#define SL_r1		0x00	/* stack pointer */
+#define SL_PC		0x08
+#define SL_MSR		0x10
+#define SL_SDR1		0x18
+#define SL_XER		0x20
+#define SL_TB		0x40
+#define SL_r2		0x48
+#define SL_CR		0x50
+#define SL_LR		0x58
+#define SL_r12		0x60
+#define SL_r13		0x68
+#define SL_r14		0x70
+#define SL_r15		0x78
+#define SL_r16		0x80
+#define SL_r17		0x88
+#define SL_r18		0x90
+#define SL_r19		0x98
+#define SL_r20		0xa0
+#define SL_r21		0xa8
+#define SL_r22		0xb0
+#define SL_r23		0xb8
+#define SL_r24		0xc0
+#define SL_r25		0xc8
+#define SL_r26		0xd0
+#define SL_r27		0xd8
+#define SL_r28		0xe0
+#define SL_r29		0xe8
+#define SL_r30		0xf0
+#define SL_r31		0xf8
+#define SL_SPRG1	0x100
+#define SL_TCR		0x108
+#define SL_SIZE		SL_TCR+8
+
+/* these macros rely on the save area being
+ * pointed to by r11 */
+
+#define SAVE_SPR(register)		\
+	mfspr	r0, SPRN_##register	;\
+	std	r0, SL_##register(r11)
+#define RESTORE_SPR(register)		\
+	ld	r0, SL_##register(r11)	;\
+	mtspr	SPRN_##register, r0
+#define SAVE_SPECIAL(special)		\
+	mf##special	r0		;\
+	std	r0, SL_##special(r11)
+#define RESTORE_SPECIAL(special)	\
+	ld	r0, SL_##special(r11)	;\
+	mt##special	r0
+#define SAVE_REGISTER(reg)		\
+	std	reg, SL_##reg(r11)
+#define RESTORE_REGISTER(reg)		\
+	ld	reg, SL_##reg(r11)
+
+/* space for storing cpu state */
+	.section .data
+	.align  5
+swsusp_save_area:
+	.space SL_SIZE
+
+	.section ".toc","aw"
+swsusp_save_area_ptr:
+	.tc	swsusp_save_area[TC],swsusp_save_area
+restore_pblist_ptr:
+	.tc	restore_pblist[TC],restore_pblist
+
+	.section .text
+	.align  5
+_GLOBAL(swsusp_arch_suspend)
+	ld	r11,swsusp_save_area_ptr@toc(r2)
+	SAVE_SPECIAL(LR)
+	SAVE_REGISTER(r1)
+	SAVE_SPECIAL(CR)
+	SAVE_SPECIAL(TB)
+	SAVE_REGISTER(r2)
+	SAVE_REGISTER(r12)
+	SAVE_REGISTER(r13)
+	SAVE_REGISTER(r14)
+	SAVE_REGISTER(r15)
+	SAVE_REGISTER(r16)
+	SAVE_REGISTER(r17)
+	SAVE_REGISTER(r18)
+	SAVE_REGISTER(r19)
+	SAVE_REGISTER(r20)
+	SAVE_REGISTER(r21)
+	SAVE_REGISTER(r22)
+	SAVE_REGISTER(r23)
+	SAVE_REGISTER(r24)
+	SAVE_REGISTER(r25)
+	SAVE_REGISTER(r26)
+	SAVE_REGISTER(r27)
+	SAVE_REGISTER(r28)
+	SAVE_REGISTER(r29)
+	SAVE_REGISTER(r30)
+	SAVE_REGISTER(r31)
+	SAVE_SPECIAL(MSR)
+	SAVE_SPECIAL(XER)
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FW_FTR_SECTION
+	SAVE_SPECIAL(SDR1)
+END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR)
+#else
+	SAVE_SPR(TCR)
+
+	/* Save SPRG1, SPRG1 be used save paca */
+	SAVE_SPR(SPRG1)
+#endif
+
+	/* we push the stack up 128 bytes but don't store the
+	 * stack pointer on the stack like a real stackframe */
+	addi	r1,r1,-128
+
+	bl _iommu_save
+	bl swsusp_save
+
+	/* restore LR */
+	ld	r11,swsusp_save_area_ptr@toc(r2)
+	RESTORE_SPECIAL(LR)
+	addi	r1,r1,128
+
+	blr
+
+/* Resume code */
+_GLOBAL(swsusp_arch_resume)
+	/* Stop pending alitvec streams and memory accesses */
+BEGIN_FTR_SECTION
+	PPC_DSSALL
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+	sync
+
+	ld	r12,restore_pblist_ptr@toc(r2)
+	ld	r12,0(r12)
+
+	cmpdi	r12,0
+	beq-	nothing_to_copy
+	li	r15,PAGE_SIZE>>3
+copyloop:
+	ld	r13,pbe_address(r12)
+	ld	r14,pbe_orig_address(r12)
+
+	mtctr	r15
+	li	r10,0
+copy_page_loop:
+	ldx	r0,r10,r13
+	stdx	r0,r10,r14
+	addi	r10,r10,8
+	bdnz copy_page_loop
+
+	ld	r12,pbe_next(r12)
+	cmpdi	r12,0
+	bne+	copyloop
+nothing_to_copy:
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* flush caches */
+	lis	r3, 0x10
+	mtctr	r3
+	li	r3, 0
+	ori	r3, r3, CONFIG_KERNEL_START>>48
+	li	r0, 48
+	sld	r3, r3, r0
+	li	r0, 0
+1:
+	dcbf	0,r3
+	addi	r3,r3,0x20
+	bdnz	1b
+
+	sync
+
+	tlbia
+#endif
+
+	ld	r11,swsusp_save_area_ptr@toc(r2)
+
+	RESTORE_SPECIAL(CR)
+
+	/* restore timebase */
+	/* load saved tb */
+	ld	r1, SL_TB(r11)
+	/* get upper 32 bits of it */
+	srdi	r2, r1, 32
+	/* clear tb lower to avoid wrap */
+	li	r0, 0
+	mttbl	r0
+	/* set tb upper */
+	mttbu	r2
+	/* set tb lower */
+	mttbl	r1
+
+	/* restore registers */
+	RESTORE_REGISTER(r1)
+	RESTORE_REGISTER(r2)
+	RESTORE_REGISTER(r12)
+	RESTORE_REGISTER(r13)
+	RESTORE_REGISTER(r14)
+	RESTORE_REGISTER(r15)
+	RESTORE_REGISTER(r16)
+	RESTORE_REGISTER(r17)
+	RESTORE_REGISTER(r18)
+	RESTORE_REGISTER(r19)
+	RESTORE_REGISTER(r20)
+	RESTORE_REGISTER(r21)
+	RESTORE_REGISTER(r22)
+	RESTORE_REGISTER(r23)
+	RESTORE_REGISTER(r24)
+	RESTORE_REGISTER(r25)
+	RESTORE_REGISTER(r26)
+	RESTORE_REGISTER(r27)
+	RESTORE_REGISTER(r28)
+	RESTORE_REGISTER(r29)
+	RESTORE_REGISTER(r30)
+	RESTORE_REGISTER(r31)
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* can't use RESTORE_SPECIAL(MSR) */
+	ld	r0, SL_MSR(r11)
+	mtmsrd	r0, 0
+BEGIN_FW_FTR_SECTION
+	RESTORE_SPECIAL(SDR1)
+END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR)
+#else
+	/* Restore SPRG1, be used to save paca */
+	ld	r0, SL_SPRG1(r11)
+	mtsprg	1, r0
+
+	RESTORE_SPECIAL(MSR)
+
+	/* Restore TCR and clear any pending bits in TSR. */
+	RESTORE_SPR(TCR)
+	lis	r0, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h
+	mtspr	SPRN_TSR, r0
+
+	/* Kick decrementer */
+	li	r0, 1
+	mtdec	r0
+
+	/* Invalidate all tlbs */
+	bl	_tlbil_all
+#endif
+	RESTORE_SPECIAL(XER)
+
+	sync
+
+	addi	r1,r1,-128
+#ifdef CONFIG_PPC_BOOK3S_64
+	bl	slb_flush_and_rebolt
+#endif
+	bl	do_after_copyback
+	addi	r1,r1,128
+
+	ld	r11,swsusp_save_area_ptr@toc(r2)
+	RESTORE_SPECIAL(LR)
+
+	li	r3, 0
+	blr
diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S
new file mode 100644
index 000000000..88cfdbd53
--- /dev/null
+++ b/arch/powerpc/kernel/swsusp_booke.S
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Based on swsusp_32.S, modified for FSL BookE by
+ * Anton Vorontsov <avorontsov@ru.mvista.com>
+ * Copyright (c) 2009-2010 MontaVista Software, LLC.
+ */
+
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/mmu.h>
+
+/*
+ * Structure for storing CPU registers on the save area.
+ */
+#define SL_SP		0
+#define SL_PC		4
+#define SL_MSR		8
+#define SL_TCR		0xc
+#define SL_SPRG0	0x10
+#define SL_SPRG1	0x14
+#define SL_SPRG2	0x18
+#define SL_SPRG3	0x1c
+#define SL_SPRG4	0x20
+#define SL_SPRG5	0x24
+#define SL_SPRG6	0x28
+#define SL_SPRG7	0x2c
+#define SL_TBU		0x30
+#define SL_TBL		0x34
+#define SL_R2		0x38
+#define SL_CR		0x3c
+#define SL_LR		0x40
+#define SL_R12		0x44	/* r12 to r31 */
+#define SL_SIZE		(SL_R12 + 80)
+
+	.section .data
+	.align	5
+
+_GLOBAL(swsusp_save_area)
+	.space	SL_SIZE
+
+
+	.section .text
+	.align	5
+
+_GLOBAL(swsusp_arch_suspend)
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+
+	mflr	r0
+	stw	r0,SL_LR(r11)
+	mfcr	r0
+	stw	r0,SL_CR(r11)
+	stw	r1,SL_SP(r11)
+	stw	r2,SL_R2(r11)
+	stmw	r12,SL_R12(r11)
+
+	/* Save MSR & TCR */
+	mfmsr	r4
+	stw	r4,SL_MSR(r11)
+	mfspr	r4,SPRN_TCR
+	stw	r4,SL_TCR(r11)
+
+	/* Get a stable timebase and save it */
+1:	mfspr	r4,SPRN_TBRU
+	stw	r4,SL_TBU(r11)
+	mfspr	r5,SPRN_TBRL
+	stw	r5,SL_TBL(r11)
+	mfspr	r3,SPRN_TBRU
+	cmpw	r3,r4
+	bne	1b
+
+	/* Save SPRGs */
+	mfspr	r4,SPRN_SPRG0
+	stw	r4,SL_SPRG0(r11)
+	mfspr	r4,SPRN_SPRG1
+	stw	r4,SL_SPRG1(r11)
+	mfspr	r4,SPRN_SPRG2
+	stw	r4,SL_SPRG2(r11)
+	mfspr	r4,SPRN_SPRG3
+	stw	r4,SL_SPRG3(r11)
+	mfspr	r4,SPRN_SPRG4
+	stw	r4,SL_SPRG4(r11)
+	mfspr	r4,SPRN_SPRG5
+	stw	r4,SL_SPRG5(r11)
+	mfspr	r4,SPRN_SPRG6
+	stw	r4,SL_SPRG6(r11)
+	mfspr	r4,SPRN_SPRG7
+	stw	r4,SL_SPRG7(r11)
+
+	/* Call the low level suspend stuff (we should probably have made
+	 * a stackframe...
+	 */
+	bl	swsusp_save
+
+	/* Restore LR from the save area */
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+	lwz	r0,SL_LR(r11)
+	mtlr	r0
+
+	blr
+
+_GLOBAL(swsusp_arch_resume)
+	sync
+
+	/* Load ptr the list of pages to copy in r3 */
+	lis	r11,(restore_pblist)@h
+	ori	r11,r11,restore_pblist@l
+	lwz	r3,0(r11)
+
+	/* Copy the pages. This is a very basic implementation, to
+	 * be replaced by something more cache efficient */
+1:
+	li	r0,256
+	mtctr	r0
+	lwz	r5,pbe_address(r3)	/* source */
+	lwz	r6,pbe_orig_address(r3)	/* destination */
+2:
+	lwz	r8,0(r5)
+	lwz	r9,4(r5)
+	lwz	r10,8(r5)
+	lwz	r11,12(r5)
+	addi	r5,r5,16
+	stw	r8,0(r6)
+	stw	r9,4(r6)
+	stw	r10,8(r6)
+	stw	r11,12(r6)
+	addi	r6,r6,16
+	bdnz	2b
+	lwz	r3,pbe_next(r3)
+	cmpwi	0,r3,0
+	bne	1b
+
+	bl flush_dcache_L1
+	bl flush_instruction_cache
+
+	lis	r11,swsusp_save_area@h
+	ori	r11,r11,swsusp_save_area@l
+
+	/*
+	 * Mappings from virtual addresses to physical addresses may be
+	 * different than they were prior to restoring hibernation state. 
+	 * Invalidate the TLB so that the boot CPU is using the new
+	 * mappings.
+	 */
+	bl	_tlbil_all
+
+	lwz	r4,SL_SPRG0(r11)
+	mtspr	SPRN_SPRG0,r4
+	lwz	r4,SL_SPRG1(r11)
+	mtspr	SPRN_SPRG1,r4
+	lwz	r4,SL_SPRG2(r11)
+	mtspr	SPRN_SPRG2,r4
+	lwz	r4,SL_SPRG3(r11)
+	mtspr	SPRN_SPRG3,r4
+	lwz	r4,SL_SPRG4(r11)
+	mtspr	SPRN_SPRG4,r4
+	lwz	r4,SL_SPRG5(r11)
+	mtspr	SPRN_SPRG5,r4
+	lwz	r4,SL_SPRG6(r11)
+	mtspr	SPRN_SPRG6,r4
+	lwz	r4,SL_SPRG7(r11)
+	mtspr	SPRN_SPRG7,r4
+
+	/* restore the MSR */
+	lwz	r3,SL_MSR(r11)
+	mtmsr	r3
+
+	/* Restore TB */
+	li	r3,0
+	mtspr	SPRN_TBWL,r3
+	lwz	r3,SL_TBU(r11)
+	lwz	r4,SL_TBL(r11)
+	mtspr	SPRN_TBWU,r3
+	mtspr	SPRN_TBWL,r4
+
+	/* Restore TCR and clear any pending bits in TSR. */
+	lwz	r4,SL_TCR(r11)
+	mtspr	SPRN_TCR,r4
+	lis	r4, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h
+	mtspr	SPRN_TSR,r4
+
+	/* Kick decrementer */
+	li	r0,1
+	mtdec	r0
+
+	/* Restore the callee-saved registers and return */
+	lwz	r0,SL_CR(r11)
+	mtcr	r0
+	lwz	r2,SL_R2(r11)
+	lmw	r12,SL_R12(r11)
+	lwz	r1,SL_SP(r11)
+	lwz	r0,SL_LR(r11)
+	mtlr	r0
+
+	li	r3,0
+	blr
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
new file mode 100644
index 000000000..bdf58ba1a
--- /dev/null
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -0,0 +1,119 @@
+/*
+ * sys_ppc32.c: Conversion between 32bit and 64bit native syscalls.
+ *
+ * Copyright (C) 2001 IBM
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h> 
+#include <linux/mm.h> 
+#include <linux/file.h> 
+#include <linux/signal.h>
+#include <linux/resource.h>
+#include <linux/times.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/in.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/sysctl.h>
+#include <linux/binfmts.h>
+#include <linux/security.h>
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <linux/elf.h>
+#include <linux/ipc.h>
+#include <linux/slab.h>
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/time.h>
+#include <asm/mmu_context.h>
+#include <asm/ppc-pci.h>
+#include <asm/syscalls.h>
+#include <asm/switch_to.h>
+
+unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
+			  unsigned long prot, unsigned long flags,
+			  unsigned long fd, unsigned long pgoff)
+{
+	/* This should remain 12 even if PAGE_SIZE changes */
+	return sys_mmap(addr, len, prot, flags, fd, pgoff << 12);
+}
+
+/* 
+ * long long munging:
+ * The 32 bit ABI passes long longs in an odd even register pair.
+ */
+
+compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count,
+			     u32 reg6, u32 poshi, u32 poslo)
+{
+	return ksys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+}
+
+compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
+			      u32 reg6, u32 poshi, u32 poslo)
+{
+	return ksys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
+}
+
+compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count)
+{
+	return ksys_readahead(fd, ((loff_t)offhi << 32) | offlo, count);
+}
+
+asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4,
+				unsigned long high, unsigned long low)
+{
+	return ksys_truncate(path, (high << 32) | low);
+}
+
+asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo,
+				     u32 lenhi, u32 lenlo)
+{
+	return ksys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo,
+			     ((loff_t)lenhi << 32) | lenlo);
+}
+
+asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
+				 unsigned long low)
+{
+	return ksys_ftruncate(fd, (high << 32) | low);
+}
+
+long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low,
+		     size_t len, int advice)
+{
+	return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, len,
+				 advice);
+}
+
+asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags,
+				   unsigned offset_hi, unsigned offset_lo,
+				   unsigned nbytes_hi, unsigned nbytes_lo)
+{
+	loff_t offset = ((loff_t)offset_hi << 32) | offset_lo;
+	loff_t nbytes = ((loff_t)nbytes_hi << 32) | nbytes_lo;
+
+	return ksys_sync_file_range(fd, offset, nbytes, flags);
+}
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
new file mode 100644
index 000000000..466216506
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls.c
@@ -0,0 +1,141 @@
+/*
+ *  Implementation of various system calls for Linux/PowerPC
+ *
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Derived from "arch/i386/kernel/sys_i386.c"
+ * Adapted from the i386 version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras (paulus@cs.anu.edu.au).
+ *
+ * This file contains various random system calls that
+ * have a non-standard calling sequence on the Linux/PPC
+ * platform.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/sys.h>
+#include <linux/ipc.h>
+#include <linux/utsname.h>
+#include <linux/file.h>
+#include <linux/personality.h>
+
+#include <linux/uaccess.h>
+#include <asm/syscalls.h>
+#include <asm/time.h>
+#include <asm/unistd.h>
+#include <asm/asm-prototypes.h>
+
+static inline long do_mmap2(unsigned long addr, size_t len,
+			unsigned long prot, unsigned long flags,
+			unsigned long fd, unsigned long off, int shift)
+{
+	long ret = -EINVAL;
+
+	if (!arch_validate_prot(prot, addr))
+		goto out;
+
+	if (shift) {
+		if (off & ((1 << shift) - 1))
+			goto out;
+		off >>= shift;
+	}
+
+	ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off);
+out:
+	return ret;
+}
+
+SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
+{
+	return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
+}
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, off_t, offset)
+{
+	return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
+}
+
+#ifdef CONFIG_PPC32
+/*
+ * Due to some executables calling the wrong select we sometimes
+ * get wrong args.  This determines how the args are being passed
+ * (a single ptr to them all args passed) then calls
+ * sys_select() with the appropriate args. -- Cort
+ */
+int
+ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
+{
+	if ( (unsigned long)n >= 4096 )
+	{
+		unsigned long __user *buffer = (unsigned long __user *)n;
+		if (!access_ok(VERIFY_READ, buffer, 5*sizeof(unsigned long))
+		    || __get_user(n, buffer)
+		    || __get_user(inp, ((fd_set __user * __user *)(buffer+1)))
+		    || __get_user(outp, ((fd_set  __user * __user *)(buffer+2)))
+		    || __get_user(exp, ((fd_set  __user * __user *)(buffer+3)))
+		    || __get_user(tvp, ((struct timeval  __user * __user *)(buffer+4))))
+			return -EFAULT;
+	}
+	return sys_select(n, inp, outp, exp, tvp);
+}
+#endif
+
+#ifdef CONFIG_PPC64
+long ppc64_personality(unsigned long personality)
+{
+	long ret;
+
+	if (personality(current->personality) == PER_LINUX32
+	    && personality(personality) == PER_LINUX)
+		personality = (personality & ~PER_MASK) | PER_LINUX32;
+	ret = sys_personality(personality);
+	if (personality(ret) == PER_LINUX32)
+		ret = (ret & ~PER_MASK) | PER_LINUX;
+	return ret;
+}
+#endif
+
+long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
+		      u32 len_high, u32 len_low)
+{
+	return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low,
+				 (u64)len_high << 32 | len_low, advice);
+}
+
+long sys_switch_endian(void)
+{
+	struct thread_info *ti;
+
+	current->thread.regs->msr ^= MSR_LE;
+
+	/*
+	 * Set TIF_RESTOREALL so that r3 isn't clobbered on return to
+	 * userspace. That also has the effect of restoring the non-volatile
+	 * GPRs, so we saved them on the way in here.
+	 */
+	ti = current_thread_info();
+	ti->flags |= _TIF_RESTOREALL;
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
new file mode 100644
index 000000000..6b107de10
--- /dev/null
+++ b/arch/powerpc/kernel/sysfs.c
@@ -0,0 +1,1054 @@
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/nodemask.h>
+#include <linux/cpumask.h>
+#include <linux/notifier.h>
+
+#include <asm/current.h>
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/hvcall.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/smp.h>
+#include <asm/pmc.h>
+#include <asm/firmware.h>
+
+#include "cacheinfo.h"
+#include "setup.h"
+
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/lppaca.h>
+#endif
+
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+#ifdef CONFIG_PPC64
+
+/*
+ * Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
+ * smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
+ * 2014:
+ *
+ *  "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
+ *  up the kernel code."
+ *
+ * powerpc-utils stopped using it as of 1.3.8. At some point in the future this
+ * code should be removed.
+ */
+
+static ssize_t store_smt_snooze_delay(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf,
+				      size_t count)
+{
+	pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n",
+		     current->comm, current->pid);
+	return count;
+}
+
+static ssize_t show_smt_snooze_delay(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n",
+		     current->comm, current->pid);
+	return sprintf(buf, "100\n");
+}
+
+static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+		   store_smt_snooze_delay);
+
+static int __init setup_smt_snooze_delay(char *str)
+{
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return 1;
+
+	pr_warn("smt-snooze-delay command line option has no effect\n");
+	return 1;
+}
+__setup("smt-snooze-delay=", setup_smt_snooze_delay);
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define MAX_BIT				63
+
+static u64 pw20_wt;
+static u64 altivec_idle_wt;
+
+static unsigned int get_idle_ticks_bit(u64 ns)
+{
+	u64 cycle;
+
+	if (ns >= 10000)
+		cycle = div_u64(ns + 500, 1000) * tb_ticks_per_usec;
+	else
+		cycle = div_u64(ns * tb_ticks_per_usec, 1000);
+
+	if (!cycle)
+		return 0;
+
+	return ilog2(cycle);
+}
+
+static void do_show_pwrmgtcr0(void *val)
+{
+	u32 *value = val;
+
+	*value = mfspr(SPRN_PWRMGTCR0);
+}
+
+static ssize_t show_pw20_state(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+	value &= PWRMGTCR0_PW20_WAIT;
+
+	return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_pw20_state(void *val)
+{
+	u32 *value = val;
+	u32 pw20_state;
+
+	pw20_state = mfspr(SPRN_PWRMGTCR0);
+
+	if (*value)
+		pw20_state |= PWRMGTCR0_PW20_WAIT;
+	else
+		pw20_state &= ~PWRMGTCR0_PW20_WAIT;
+
+	mtspr(SPRN_PWRMGTCR0, pw20_state);
+}
+
+static ssize_t store_pw20_state(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	if (kstrtou32(buf, 0, &value))
+		return -EINVAL;
+
+	if (value > 1)
+		return -EINVAL;
+
+	smp_call_function_single(cpu, do_store_pw20_state, &value, 1);
+
+	return count;
+}
+
+static ssize_t show_pw20_wait_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	u64 tb_cycle = 1;
+	u64 time;
+
+	unsigned int cpu = dev->id;
+
+	if (!pw20_wt) {
+		smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+		value = (value & PWRMGTCR0_PW20_ENT) >>
+					PWRMGTCR0_PW20_ENT_SHIFT;
+
+		tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+		/* convert ms to ns */
+		if (tb_ticks_per_usec > 1000) {
+			time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+		} else {
+			u32 rem_us;
+
+			time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+						&rem_us);
+			time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+		}
+	} else {
+		time = pw20_wt;
+	}
+
+	return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_pw20_wait_entry_bit(void *val)
+{
+	u32 *value = val;
+	u32 pw20_idle;
+
+	pw20_idle = mfspr(SPRN_PWRMGTCR0);
+
+	/* Set Automatic PW20 Core Idle Count */
+	/* clear count */
+	pw20_idle &= ~PWRMGTCR0_PW20_ENT;
+
+	/* set count */
+	pw20_idle |= ((MAX_BIT - *value) << PWRMGTCR0_PW20_ENT_SHIFT);
+
+	mtspr(SPRN_PWRMGTCR0, pw20_idle);
+}
+
+static ssize_t store_pw20_wait_time(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 entry_bit;
+	u64 value;
+
+	unsigned int cpu = dev->id;
+
+	if (kstrtou64(buf, 0, &value))
+		return -EINVAL;
+
+	if (!value)
+		return -EINVAL;
+
+	entry_bit = get_idle_ticks_bit(value);
+	if (entry_bit > MAX_BIT)
+		return -EINVAL;
+
+	pw20_wt = value;
+
+	smp_call_function_single(cpu, set_pw20_wait_entry_bit,
+				&entry_bit, 1);
+
+	return count;
+}
+
+static ssize_t show_altivec_idle(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+	value &= PWRMGTCR0_AV_IDLE_PD_EN;
+
+	return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_altivec_idle(void *val)
+{
+	u32 *value = val;
+	u32 altivec_idle;
+
+	altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+	if (*value)
+		altivec_idle |= PWRMGTCR0_AV_IDLE_PD_EN;
+	else
+		altivec_idle &= ~PWRMGTCR0_AV_IDLE_PD_EN;
+
+	mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 value;
+	unsigned int cpu = dev->id;
+
+	if (kstrtou32(buf, 0, &value))
+		return -EINVAL;
+
+	if (value > 1)
+		return -EINVAL;
+
+	smp_call_function_single(cpu, do_store_altivec_idle, &value, 1);
+
+	return count;
+}
+
+static ssize_t show_altivec_idle_wait_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	u32 value;
+	u64 tb_cycle = 1;
+	u64 time;
+
+	unsigned int cpu = dev->id;
+
+	if (!altivec_idle_wt) {
+		smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+		value = (value & PWRMGTCR0_AV_IDLE_CNT) >>
+					PWRMGTCR0_AV_IDLE_CNT_SHIFT;
+
+		tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+		/* convert ms to ns */
+		if (tb_ticks_per_usec > 1000) {
+			time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+		} else {
+			u32 rem_us;
+
+			time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+						&rem_us);
+			time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+		}
+	} else {
+		time = altivec_idle_wt;
+	}
+
+	return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_altivec_idle_wait_entry_bit(void *val)
+{
+	u32 *value = val;
+	u32 altivec_idle;
+
+	altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+	/* Set Automatic AltiVec Idle Count */
+	/* clear count */
+	altivec_idle &= ~PWRMGTCR0_AV_IDLE_CNT;
+
+	/* set count */
+	altivec_idle |= ((MAX_BIT - *value) << PWRMGTCR0_AV_IDLE_CNT_SHIFT);
+
+	mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle_wait_time(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	u32 entry_bit;
+	u64 value;
+
+	unsigned int cpu = dev->id;
+
+	if (kstrtou64(buf, 0, &value))
+		return -EINVAL;
+
+	if (!value)
+		return -EINVAL;
+
+	entry_bit = get_idle_ticks_bit(value);
+	if (entry_bit > MAX_BIT)
+		return -EINVAL;
+
+	altivec_idle_wt = value;
+
+	smp_call_function_single(cpu, set_altivec_idle_wait_entry_bit,
+				&entry_bit, 1);
+
+	return count;
+}
+
+/*
+ * Enable/Disable interface:
+ * 0, disable. 1, enable.
+ */
+static DEVICE_ATTR(pw20_state, 0600, show_pw20_state, store_pw20_state);
+static DEVICE_ATTR(altivec_idle, 0600, show_altivec_idle, store_altivec_idle);
+
+/*
+ * Set wait time interface:(Nanosecond)
+ * Example: Base on TBfreq is 41MHZ.
+ * 1~48(ns): TB[63]
+ * 49~97(ns): TB[62]
+ * 98~195(ns): TB[61]
+ * 196~390(ns): TB[60]
+ * 391~780(ns): TB[59]
+ * 781~1560(ns): TB[58]
+ * ...
+ */
+static DEVICE_ATTR(pw20_wait_time, 0600,
+			show_pw20_wait_time,
+			store_pw20_wait_time);
+static DEVICE_ATTR(altivec_idle_wait_time, 0600,
+			show_altivec_idle_wait_time,
+			store_altivec_idle_wait_time);
+#endif
+
+/*
+ * Enabling PMCs will slow partition context switch times so we only do
+ * it the first time we write to the PMCs.
+ */
+
+static DEFINE_PER_CPU(char, pmcs_enabled);
+
+void ppc_enable_pmcs(void)
+{
+	ppc_set_pmu_inuse(1);
+
+	/* Only need to enable them once */
+	if (__this_cpu_read(pmcs_enabled))
+		return;
+
+	__this_cpu_write(pmcs_enabled, 1);
+
+	if (ppc_md.enable_pmcs)
+		ppc_md.enable_pmcs();
+}
+EXPORT_SYMBOL(ppc_enable_pmcs);
+
+#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \
+static void read_##NAME(void *val) \
+{ \
+	*(unsigned long *)val = mfspr(ADDRESS);	\
+} \
+static void write_##NAME(void *val) \
+{ \
+	EXTRA; \
+	mtspr(ADDRESS, *(unsigned long *)val);	\
+}
+
+#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \
+static ssize_t show_##NAME(struct device *dev, \
+			struct device_attribute *attr, \
+			char *buf) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
+	unsigned long val; \
+	smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1);	\
+	return sprintf(buf, "%lx\n", val); \
+} \
+static ssize_t __used \
+	store_##NAME(struct device *dev, struct device_attribute *attr, \
+			const char *buf, size_t count) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
+	unsigned long val; \
+	int ret = sscanf(buf, "%lx", &val); \
+	if (ret != 1) \
+		return -EINVAL; \
+	smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \
+	return count; \
+}
+
+#define SYSFS_PMCSETUP(NAME, ADDRESS) \
+	__SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \
+	__SYSFS_SPRSETUP_SHOW_STORE(NAME)
+#define SYSFS_SPRSETUP(NAME, ADDRESS) \
+	__SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \
+	__SYSFS_SPRSETUP_SHOW_STORE(NAME)
+
+#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \
+	__SYSFS_SPRSETUP_SHOW_STORE(NAME)
+
+/* Let's define all possible registers, we'll only hook up the ones
+ * that are implemented on the current processor
+ */
+
+#if defined(CONFIG_PPC64)
+#define HAS_PPC_PMC_CLASSIC	1
+#define HAS_PPC_PMC_IBM		1
+#define HAS_PPC_PMC_PA6T	1
+#elif defined(CONFIG_6xx)
+#define HAS_PPC_PMC_CLASSIC	1
+#define HAS_PPC_PMC_IBM		1
+#define HAS_PPC_PMC_G4		1
+#endif
+
+
+#ifdef HAS_PPC_PMC_CLASSIC
+SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0);
+SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1);
+SYSFS_PMCSETUP(pmc1, SPRN_PMC1);
+SYSFS_PMCSETUP(pmc2, SPRN_PMC2);
+SYSFS_PMCSETUP(pmc3, SPRN_PMC3);
+SYSFS_PMCSETUP(pmc4, SPRN_PMC4);
+SYSFS_PMCSETUP(pmc5, SPRN_PMC5);
+SYSFS_PMCSETUP(pmc6, SPRN_PMC6);
+
+#ifdef HAS_PPC_PMC_G4
+SYSFS_PMCSETUP(mmcr2, SPRN_MMCR2);
+#endif
+
+#ifdef CONFIG_PPC64
+SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
+SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
+
+SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
+SYSFS_SPRSETUP(purr, SPRN_PURR);
+SYSFS_SPRSETUP(spurr, SPRN_SPURR);
+SYSFS_SPRSETUP(pir, SPRN_PIR);
+SYSFS_SPRSETUP(tscr, SPRN_TSCR);
+
+/*
+  Lets only enable read for phyp resources and
+  enable write when needed with a separate function.
+  Lets be conservative and default to pseries.
+*/
+static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
+static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
+static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr);
+
+/*
+ * This is the system wide DSCR register default value. Any
+ * change to this default value through the sysfs interface
+ * will update all per cpu DSCR default values across the
+ * system stored in their respective PACA structures.
+ */
+static unsigned long dscr_default;
+
+/**
+ * read_dscr() - Fetch the cpu specific DSCR default
+ * @val:	Returned cpu specific DSCR default value
+ *
+ * This function returns the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
+static void read_dscr(void *val)
+{
+	*(unsigned long *)val = get_paca()->dscr_default;
+}
+
+
+/**
+ * write_dscr() - Update the cpu specific DSCR default
+ * @val:	New cpu specific DSCR default value to update
+ *
+ * This function updates the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
+static void write_dscr(void *val)
+{
+	get_paca()->dscr_default = *(unsigned long *)val;
+	if (!current->thread.dscr_inherit) {
+		current->thread.dscr = *(unsigned long *)val;
+		mtspr(SPRN_DSCR, *(unsigned long *)val);
+	}
+}
+
+SYSFS_SPRSETUP_SHOW_STORE(dscr);
+static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
+
+static void add_write_permission_dev_attr(struct device_attribute *attr)
+{
+	attr->attr.mode |= 0200;
+}
+
+/**
+ * show_dscr_default() - Fetch the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ *
+ * This function returns the system wide DSCR default value.
+ */
+static ssize_t show_dscr_default(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lx\n", dscr_default);
+}
+
+/**
+ * store_dscr_default() - Update the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ * @count:	Size of the update
+ *
+ * This function updates the system wide DSCR default value.
+ */
+static ssize_t __used store_dscr_default(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	unsigned long val;
+	int ret = 0;
+	
+	ret = sscanf(buf, "%lx", &val);
+	if (ret != 1)
+		return -EINVAL;
+	dscr_default = val;
+
+	on_each_cpu(write_dscr, &val, 1);
+
+	return count;
+}
+
+static DEVICE_ATTR(dscr_default, 0600,
+		show_dscr_default, store_dscr_default);
+
+static void sysfs_create_dscr_default(void)
+{
+	if (cpu_has_feature(CPU_FTR_DSCR)) {
+		int err = 0;
+		int cpu;
+
+		dscr_default = spr_default_dscr;
+		for_each_possible_cpu(cpu)
+			paca_ptrs[cpu]->dscr_default = dscr_default;
+
+		err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
+	}
+}
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef HAS_PPC_PMC_PA6T
+SYSFS_PMCSETUP(pa6t_pmc0, SPRN_PA6T_PMC0);
+SYSFS_PMCSETUP(pa6t_pmc1, SPRN_PA6T_PMC1);
+SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2);
+SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
+SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
+SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
+#ifdef CONFIG_DEBUG_KERNEL
+SYSFS_SPRSETUP(hid0, SPRN_HID0);
+SYSFS_SPRSETUP(hid1, SPRN_HID1);
+SYSFS_SPRSETUP(hid4, SPRN_HID4);
+SYSFS_SPRSETUP(hid5, SPRN_HID5);
+SYSFS_SPRSETUP(ima0, SPRN_PA6T_IMA0);
+SYSFS_SPRSETUP(ima1, SPRN_PA6T_IMA1);
+SYSFS_SPRSETUP(ima2, SPRN_PA6T_IMA2);
+SYSFS_SPRSETUP(ima3, SPRN_PA6T_IMA3);
+SYSFS_SPRSETUP(ima4, SPRN_PA6T_IMA4);
+SYSFS_SPRSETUP(ima5, SPRN_PA6T_IMA5);
+SYSFS_SPRSETUP(ima6, SPRN_PA6T_IMA6);
+SYSFS_SPRSETUP(ima7, SPRN_PA6T_IMA7);
+SYSFS_SPRSETUP(ima8, SPRN_PA6T_IMA8);
+SYSFS_SPRSETUP(ima9, SPRN_PA6T_IMA9);
+SYSFS_SPRSETUP(imaat, SPRN_PA6T_IMAAT);
+SYSFS_SPRSETUP(btcr, SPRN_PA6T_BTCR);
+SYSFS_SPRSETUP(pccr, SPRN_PA6T_PCCR);
+SYSFS_SPRSETUP(rpccr, SPRN_PA6T_RPCCR);
+SYSFS_SPRSETUP(der, SPRN_PA6T_DER);
+SYSFS_SPRSETUP(mer, SPRN_PA6T_MER);
+SYSFS_SPRSETUP(ber, SPRN_PA6T_BER);
+SYSFS_SPRSETUP(ier, SPRN_PA6T_IER);
+SYSFS_SPRSETUP(sier, SPRN_PA6T_SIER);
+SYSFS_SPRSETUP(siar, SPRN_PA6T_SIAR);
+SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0);
+SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1);
+SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2);
+SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
+#endif /* CONFIG_DEBUG_KERNEL */
+#endif /* HAS_PPC_PMC_PA6T */
+
+#ifdef HAS_PPC_PMC_IBM
+static struct device_attribute ibm_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+};
+#endif /* HAS_PPC_PMC_G4 */
+
+#ifdef HAS_PPC_PMC_G4
+static struct device_attribute g4_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
+};
+#endif /* HAS_PPC_PMC_G4 */
+
+static struct device_attribute classic_pmc_attrs[] = {
+	__ATTR(pmc1, 0600, show_pmc1, store_pmc1),
+	__ATTR(pmc2, 0600, show_pmc2, store_pmc2),
+	__ATTR(pmc3, 0600, show_pmc3, store_pmc3),
+	__ATTR(pmc4, 0600, show_pmc4, store_pmc4),
+	__ATTR(pmc5, 0600, show_pmc5, store_pmc5),
+	__ATTR(pmc6, 0600, show_pmc6, store_pmc6),
+#ifdef CONFIG_PPC64
+	__ATTR(pmc7, 0600, show_pmc7, store_pmc7),
+	__ATTR(pmc8, 0600, show_pmc8, store_pmc8),
+#endif
+};
+
+#ifdef HAS_PPC_PMC_PA6T
+static struct device_attribute pa6t_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
+	__ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
+	__ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
+	__ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
+	__ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
+	__ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
+#ifdef CONFIG_DEBUG_KERNEL
+	__ATTR(hid0, 0600, show_hid0, store_hid0),
+	__ATTR(hid1, 0600, show_hid1, store_hid1),
+	__ATTR(hid4, 0600, show_hid4, store_hid4),
+	__ATTR(hid5, 0600, show_hid5, store_hid5),
+	__ATTR(ima0, 0600, show_ima0, store_ima0),
+	__ATTR(ima1, 0600, show_ima1, store_ima1),
+	__ATTR(ima2, 0600, show_ima2, store_ima2),
+	__ATTR(ima3, 0600, show_ima3, store_ima3),
+	__ATTR(ima4, 0600, show_ima4, store_ima4),
+	__ATTR(ima5, 0600, show_ima5, store_ima5),
+	__ATTR(ima6, 0600, show_ima6, store_ima6),
+	__ATTR(ima7, 0600, show_ima7, store_ima7),
+	__ATTR(ima8, 0600, show_ima8, store_ima8),
+	__ATTR(ima9, 0600, show_ima9, store_ima9),
+	__ATTR(imaat, 0600, show_imaat, store_imaat),
+	__ATTR(btcr, 0600, show_btcr, store_btcr),
+	__ATTR(pccr, 0600, show_pccr, store_pccr),
+	__ATTR(rpccr, 0600, show_rpccr, store_rpccr),
+	__ATTR(der, 0600, show_der, store_der),
+	__ATTR(mer, 0600, show_mer, store_mer),
+	__ATTR(ber, 0600, show_ber, store_ber),
+	__ATTR(ier, 0600, show_ier, store_ier),
+	__ATTR(sier, 0600, show_sier, store_sier),
+	__ATTR(siar, 0600, show_siar, store_siar),
+	__ATTR(tsr0, 0600, show_tsr0, store_tsr0),
+	__ATTR(tsr1, 0600, show_tsr1, store_tsr1),
+	__ATTR(tsr2, 0600, show_tsr2, store_tsr2),
+	__ATTR(tsr3, 0600, show_tsr3, store_tsr3),
+#endif /* CONFIG_DEBUG_KERNEL */
+};
+#endif /* HAS_PPC_PMC_PA6T */
+#endif /* HAS_PPC_PMC_CLASSIC */
+
+static int register_cpu_online(unsigned int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
+	int i, nattrs;
+
+	/* For cpus present at boot a reference was already grabbed in register_cpu() */
+	if (!s->of_node)
+		s->of_node = of_get_cpu_node(cpu, NULL);
+
+#ifdef CONFIG_PPC64
+	if (cpu_has_feature(CPU_FTR_SMT))
+		device_create_file(s, &dev_attr_smt_snooze_delay);
+#endif
+
+	/* PMC stuff */
+	switch (cur_cpu_spec->pmc_type) {
+#ifdef HAS_PPC_PMC_IBM
+	case PPC_PMC_IBM:
+		attrs = ibm_common_attrs;
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = classic_pmc_attrs;
+		break;
+#endif /* HAS_PPC_PMC_IBM */
+#ifdef HAS_PPC_PMC_G4
+	case PPC_PMC_G4:
+		attrs = g4_common_attrs;
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = classic_pmc_attrs;
+		break;
+#endif /* HAS_PPC_PMC_G4 */
+#ifdef HAS_PPC_PMC_PA6T
+	case PPC_PMC_PA6T:
+		/* PA Semi starts counting at PMC0 */
+		attrs = pa6t_attrs;
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = NULL;
+		break;
+#endif /* HAS_PPC_PMC_PA6T */
+	default:
+		attrs = NULL;
+		nattrs = 0;
+		pmc_attrs = NULL;
+	}
+
+	for (i = 0; i < nattrs; i++)
+		device_create_file(s, &attrs[i]);
+
+	if (pmc_attrs)
+		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
+			device_create_file(s, &pmc_attrs[i]);
+
+#ifdef CONFIG_PPC64
+	if (cpu_has_feature(CPU_FTR_MMCRA))
+		device_create_file(s, &dev_attr_mmcra);
+
+	if (cpu_has_feature(CPU_FTR_PURR)) {
+		if (!firmware_has_feature(FW_FEATURE_LPAR))
+			add_write_permission_dev_attr(&dev_attr_purr);
+		device_create_file(s, &dev_attr_purr);
+	}
+
+	if (cpu_has_feature(CPU_FTR_SPURR))
+		device_create_file(s, &dev_attr_spurr);
+
+	if (cpu_has_feature(CPU_FTR_DSCR))
+		device_create_file(s, &dev_attr_dscr);
+
+	if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
+		device_create_file(s, &dev_attr_pir);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		!firmware_has_feature(FW_FEATURE_LPAR))
+		device_create_file(s, &dev_attr_tscr);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+		device_create_file(s, &dev_attr_pw20_state);
+		device_create_file(s, &dev_attr_pw20_wait_time);
+
+		device_create_file(s, &dev_attr_altivec_idle);
+		device_create_file(s, &dev_attr_altivec_idle_wait_time);
+	}
+#endif
+	cacheinfo_cpu_online(cpu);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int unregister_cpu_online(unsigned int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
+	int i, nattrs;
+
+	BUG_ON(!c->hotpluggable);
+
+#ifdef CONFIG_PPC64
+	if (cpu_has_feature(CPU_FTR_SMT))
+		device_remove_file(s, &dev_attr_smt_snooze_delay);
+#endif
+
+	/* PMC stuff */
+	switch (cur_cpu_spec->pmc_type) {
+#ifdef HAS_PPC_PMC_IBM
+	case PPC_PMC_IBM:
+		attrs = ibm_common_attrs;
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = classic_pmc_attrs;
+		break;
+#endif /* HAS_PPC_PMC_IBM */
+#ifdef HAS_PPC_PMC_G4
+	case PPC_PMC_G4:
+		attrs = g4_common_attrs;
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = classic_pmc_attrs;
+		break;
+#endif /* HAS_PPC_PMC_G4 */
+#ifdef HAS_PPC_PMC_PA6T
+	case PPC_PMC_PA6T:
+		/* PA Semi starts counting at PMC0 */
+		attrs = pa6t_attrs;
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
+		pmc_attrs = NULL;
+		break;
+#endif /* HAS_PPC_PMC_PA6T */
+	default:
+		attrs = NULL;
+		nattrs = 0;
+		pmc_attrs = NULL;
+	}
+
+	for (i = 0; i < nattrs; i++)
+		device_remove_file(s, &attrs[i]);
+
+	if (pmc_attrs)
+		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
+			device_remove_file(s, &pmc_attrs[i]);
+
+#ifdef CONFIG_PPC64
+	if (cpu_has_feature(CPU_FTR_MMCRA))
+		device_remove_file(s, &dev_attr_mmcra);
+
+	if (cpu_has_feature(CPU_FTR_PURR))
+		device_remove_file(s, &dev_attr_purr);
+
+	if (cpu_has_feature(CPU_FTR_SPURR))
+		device_remove_file(s, &dev_attr_spurr);
+
+	if (cpu_has_feature(CPU_FTR_DSCR))
+		device_remove_file(s, &dev_attr_dscr);
+
+	if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
+		device_remove_file(s, &dev_attr_pir);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		!firmware_has_feature(FW_FEATURE_LPAR))
+		device_remove_file(s, &dev_attr_tscr);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+		device_remove_file(s, &dev_attr_pw20_state);
+		device_remove_file(s, &dev_attr_pw20_wait_time);
+
+		device_remove_file(s, &dev_attr_altivec_idle);
+		device_remove_file(s, &dev_attr_altivec_idle_wait_time);
+	}
+#endif
+	cacheinfo_cpu_offline(cpu);
+	of_node_put(s->of_node);
+	s->of_node = NULL;
+	return 0;
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+#define unregister_cpu_online NULL
+#endif
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+ssize_t arch_cpu_probe(const char *buf, size_t count)
+{
+	if (ppc_md.cpu_probe)
+		return ppc_md.cpu_probe(buf, count);
+
+	return -EINVAL;
+}
+
+ssize_t arch_cpu_release(const char *buf, size_t count)
+{
+	if (ppc_md.cpu_release)
+		return ppc_md.cpu_release(buf, count);
+
+	return -EINVAL;
+}
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
+
+static DEFINE_MUTEX(cpu_mutex);
+
+int cpu_add_dev_attr(struct device_attribute *attr)
+{
+	int cpu;
+
+	mutex_lock(&cpu_mutex);
+
+	for_each_possible_cpu(cpu) {
+		device_create_file(get_cpu_device(cpu), attr);
+	}
+
+	mutex_unlock(&cpu_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr);
+
+int cpu_add_dev_attr_group(struct attribute_group *attrs)
+{
+	int cpu;
+	struct device *dev;
+	int ret;
+
+	mutex_lock(&cpu_mutex);
+
+	for_each_possible_cpu(cpu) {
+		dev = get_cpu_device(cpu);
+		ret = sysfs_create_group(&dev->kobj, attrs);
+		WARN_ON(ret != 0);
+	}
+
+	mutex_unlock(&cpu_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr_group);
+
+
+void cpu_remove_dev_attr(struct device_attribute *attr)
+{
+	int cpu;
+
+	mutex_lock(&cpu_mutex);
+
+	for_each_possible_cpu(cpu) {
+		device_remove_file(get_cpu_device(cpu), attr);
+	}
+
+	mutex_unlock(&cpu_mutex);
+}
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr);
+
+void cpu_remove_dev_attr_group(struct attribute_group *attrs)
+{
+	int cpu;
+	struct device *dev;
+
+	mutex_lock(&cpu_mutex);
+
+	for_each_possible_cpu(cpu) {
+		dev = get_cpu_device(cpu);
+		sysfs_remove_group(&dev->kobj, attrs);
+	}
+
+	mutex_unlock(&cpu_mutex);
+}
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
+
+
+/* NUMA stuff */
+
+#ifdef CONFIG_NUMA
+static void register_nodes(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		register_one_node(i);
+}
+
+int sysfs_add_device_to_node(struct device *dev, int nid)
+{
+	struct node *node = node_devices[nid];
+	return sysfs_create_link(&node->dev.kobj, &dev->kobj,
+			kobject_name(&dev->kobj));
+}
+EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
+
+void sysfs_remove_device_from_node(struct device *dev, int nid)
+{
+	struct node *node = node_devices[nid];
+	sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
+
+#else
+static void register_nodes(void)
+{
+	return;
+}
+
+#endif
+
+/* Only valid if CPU is present. */
+static ssize_t show_physical_id(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+
+	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->dev.id));
+}
+static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL);
+
+static int __init topology_init(void)
+{
+	int cpu, r;
+
+	register_nodes();
+
+	for_each_possible_cpu(cpu) {
+		struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+		/*
+		 * For now, we just see if the system supports making
+		 * the RTAS calls for CPU hotplug.  But, there may be a
+		 * more comprehensive way to do this for an individual
+		 * CPU.  For instance, the boot cpu might never be valid
+		 * for hotplugging.
+		 */
+		if (ppc_md.cpu_die)
+			c->hotpluggable = 1;
+
+		if (cpu_online(cpu) || c->hotpluggable) {
+			register_cpu(c, cpu);
+
+			device_create_file(&c->dev, &dev_attr_physical_id);
+		}
+	}
+	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/topology:online",
+			      register_cpu_online, unregister_cpu_online);
+	WARN_ON(r < 0);
+#ifdef CONFIG_PPC64
+	sysfs_create_dscr_default();
+#endif /* CONFIG_PPC64 */
+
+	return 0;
+}
+subsys_initcall(topology_init);
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
new file mode 100644
index 000000000..919a32746
--- /dev/null
+++ b/arch/powerpc/kernel/systbl.S
@@ -0,0 +1,50 @@
+/*
+ * This file contains the table of syscall-handling functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) 
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define SYSCALL(func)		.8byte	DOTSYM(sys_##func),DOTSYM(sys_##func)
+#define COMPAT_SYS(func)	.8byte	DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
+#define PPC_SYS(func)		.8byte	DOTSYM(ppc_##func),DOTSYM(ppc_##func)
+#define OLDSYS(func)		.8byte	DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
+#define SYS32ONLY(func)		.8byte	DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
+#define PPC64ONLY(func)		.8byte	DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall)
+#define SYSX(f, f3264, f32)	.8byte	DOTSYM(f),DOTSYM(f3264)
+#else
+#define SYSCALL(func)		.long	sys_##func
+#define COMPAT_SYS(func)	.long	sys_##func
+#define PPC_SYS(func)		.long	ppc_##func
+#define OLDSYS(func)		.long	sys_##func
+#define SYS32ONLY(func)		.long	sys_##func
+#define PPC64ONLY(func)		.long	sys_ni_syscall
+#define SYSX(f, f3264, f32)	.long	f32
+#endif
+#define SYSCALL_SPU(func)	SYSCALL(func)
+#define COMPAT_SYS_SPU(func)	COMPAT_SYS(func)
+#define COMPAT_SPU_NEW(func)	COMPAT_SYS(func)
+#define SYSX_SPU(f, f3264, f32)	SYSX(f, f3264, f32)
+
+.section .rodata,"a"
+
+#ifdef CONFIG_PPC64
+	.p2align	3
+#endif
+
+.globl sys_call_table
+sys_call_table:
+
+#include <asm/systbl.h>
diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c
new file mode 100644
index 000000000..465325872
--- /dev/null
+++ b/arch/powerpc/kernel/systbl_chk.c
@@ -0,0 +1,60 @@
+/*
+ * This file, when run through CPP produces a list of syscall numbers
+ * in the order of systbl.h.  That way we can check for gaps and syscalls
+ * that are out of order.
+ *
+ * Unfortunately, we cannot check for the correct ordering of entries
+ * using SYSX().
+ *
+ * Copyright © IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/unistd.h>
+
+#define SYSCALL(func)		__NR_##func
+#define COMPAT_SYS(func)	__NR_##func
+#define PPC_SYS(func)		__NR_##func
+#ifdef CONFIG_PPC64
+#define OLDSYS(func)		-1
+#define SYS32ONLY(func)		-1
+#define PPC64ONLY(func)		__NR_##func
+#else
+#define OLDSYS(func)		__NR_old##func
+#define SYS32ONLY(func)		__NR_##func
+#define PPC64ONLY(func)		-1
+#endif
+#define SYSX(f, f3264, f32)	-1
+
+#define SYSCALL_SPU(func)	SYSCALL(func)
+#define COMPAT_SYS_SPU(func)	COMPAT_SYS(func)
+#define COMPAT_SPU_NEW(func)	COMPAT_SYS(_new##func)
+#define SYSX_SPU(f, f3264, f32)	SYSX(f, f3264, f32)
+
+/* Just insert a marker for ni_syscalls */
+#define	__NR_ni_syscall		-1
+
+/*
+ * These are the known exceptions.
+ * Hopefully, there will be no more.
+ */
+#define	__NR_llseek		__NR__llseek
+#undef	__NR_umount
+#define	__NR_umount		__NR_umount2
+#define	__NR_old_getrlimit	__NR_getrlimit
+#define	__NR_newstat		__NR_stat
+#define	__NR_newlstat		__NR_lstat
+#define	__NR_newfstat		__NR_fstat
+#define	__NR_newuname		__NR_uname
+#define	__NR_sysctl		__NR__sysctl
+#define __NR_olddebug_setcontext	__NR_sys_debug_setcontext
+
+/* We call sys_ugetrlimit for syscall number __NR_getrlimit */
+#define getrlimit		ugetrlimit
+
+START_TABLE
+#include <asm/systbl.h>
+END_TABLE NR_syscalls
diff --git a/arch/powerpc/kernel/systbl_chk.sh b/arch/powerpc/kernel/systbl_chk.sh
new file mode 100644
index 000000000..f2e356c2a
--- /dev/null
+++ b/arch/powerpc/kernel/systbl_chk.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+#
+# Just process the CPP output from systbl_chk.c and complain
+# if anything is out of order.
+#
+# Copyright © 2008 IBM Corporation
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version
+# 2 of the License, or (at your option) any later version.
+
+awk	'BEGIN { num = -1; }	# Ignore the beginning of the file
+	/^#/ { next; }
+	/^[ \t]*$/ { next; }
+	/^START_TABLE/ { num = 0; next; }
+	/^END_TABLE/ {
+		if (num != $2) {
+			printf "Error: NR_syscalls (%s) is not one more than the last syscall (%s)\n",
+				$2, num - 1;
+			exit(1);
+		}
+		num = -1;	# Ignore the rest of the file
+	}
+	{
+		if (num == -1) next;
+		if (($1 != -1) && ($1 != num)) {
+			printf "Error: Syscall %s out of order (expected %s)\n",
+				$1, num;
+			exit(1);
+		};
+		num++;
+	}' "$1"
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
new file mode 100644
index 000000000..8ece45c2a
--- /dev/null
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * temp.c	Thermal management for cpu's with Thermal Assist Units
+ *
+ * Written by Troy Benjegerdes <hozer@drgw.net>
+ *
+ * TODO:
+ * dynamic power management to limit peak CPU temp (using ICTC)
+ * calibration???
+ *
+ * Silly, crazy ideas: use cpu load (from scheduler) and ICTC to extend battery
+ * life in portables, and add a 'performance/watt' metric somewhere in /proc
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+
+#include <asm/io.h>
+#include <asm/reg.h>
+#include <asm/nvram.h>
+#include <asm/cache.h>
+#include <asm/8xx_immap.h>
+#include <asm/machdep.h>
+#include <asm/asm-prototypes.h>
+
+#include "setup.h"
+
+static struct tau_temp
+{
+	int interrupts;
+	unsigned char low;
+	unsigned char high;
+	unsigned char grew;
+} tau[NR_CPUS];
+
+static bool tau_int_enable;
+
+/* TODO: put these in a /proc interface, with some sanity checks, and maybe
+ * dynamic adjustment to minimize # of interrupts */
+/* configurable values for step size and how much to expand the window when
+ * we get an interrupt. These are based on the limit that was out of range */
+#define step_size		2	/* step size when temp goes out of range */
+#define window_expand		1	/* expand the window by this much */
+/* configurable values for shrinking the window */
+#define shrink_timer	2000	/* period between shrinking the window */
+#define min_window	2	/* minimum window size, degrees C */
+
+static void set_thresholds(unsigned long cpu)
+{
+	u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
+
+	/* setup THRM1, threshold, valid bit, interrupt when below threshold */
+	mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
+
+	/* setup THRM2, threshold, valid bit, interrupt when above threshold */
+	mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
+}
+
+static void TAUupdate(int cpu)
+{
+	u32 thrm;
+	u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
+
+	/* if both thresholds are crossed, the step_sizes cancel out
+	 * and the window winds up getting expanded twice. */
+	thrm = mfspr(SPRN_THRM1);
+	if ((thrm & bits) == bits) {
+		mtspr(SPRN_THRM1, 0);
+
+		if (tau[cpu].low >= step_size) {
+			tau[cpu].low -= step_size;
+			tau[cpu].high -= (step_size - window_expand);
+		}
+		tau[cpu].grew = 1;
+		pr_debug("%s: low threshold crossed\n", __func__);
+	}
+	thrm = mfspr(SPRN_THRM2);
+	if ((thrm & bits) == bits) {
+		mtspr(SPRN_THRM2, 0);
+
+		if (tau[cpu].high <= 127 - step_size) {
+			tau[cpu].low += (step_size - window_expand);
+			tau[cpu].high += step_size;
+		}
+		tau[cpu].grew = 1;
+		pr_debug("%s: high threshold crossed\n", __func__);
+	}
+}
+
+#ifdef CONFIG_TAU_INT
+/*
+ * TAU interrupts - called when we have a thermal assist unit interrupt
+ * with interrupts disabled
+ */
+
+void TAUException(struct pt_regs * regs)
+{
+	int cpu = smp_processor_id();
+
+	irq_enter();
+	tau[cpu].interrupts++;
+
+	TAUupdate(cpu);
+
+	irq_exit();
+}
+#endif /* CONFIG_TAU_INT */
+
+static void tau_timeout(void * info)
+{
+	int cpu;
+	int size;
+	int shrink;
+
+	cpu = smp_processor_id();
+
+	if (!tau_int_enable)
+		TAUupdate(cpu);
+
+	/* Stop thermal sensor comparisons and interrupts */
+	mtspr(SPRN_THRM3, 0);
+
+	size = tau[cpu].high - tau[cpu].low;
+	if (size > min_window && ! tau[cpu].grew) {
+		/* do an exponential shrink of half the amount currently over size */
+		shrink = (2 + size - min_window) / 4;
+		if (shrink) {
+			tau[cpu].low += shrink;
+			tau[cpu].high -= shrink;
+		} else { /* size must have been min_window + 1 */
+			tau[cpu].low += 1;
+#if 1 /* debug */
+			if ((tau[cpu].high - tau[cpu].low) != min_window){
+				printk(KERN_ERR "temp.c: line %d, logic error\n", __LINE__);
+			}
+#endif
+		}
+	}
+
+	tau[cpu].grew = 0;
+
+	set_thresholds(cpu);
+
+	/* Restart thermal sensor comparisons and interrupts.
+	 * The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
+	 * recommends that "the maximum value be set in THRM3 under all
+	 * conditions."
+	 */
+	mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
+}
+
+static struct workqueue_struct *tau_workq;
+
+static void tau_work_func(struct work_struct *work)
+{
+	msleep(shrink_timer);
+	on_each_cpu(tau_timeout, NULL, 0);
+	/* schedule ourselves to be run again */
+	queue_work(tau_workq, work);
+}
+
+DECLARE_WORK(tau_work, tau_work_func);
+
+/*
+ * setup the TAU
+ *
+ * Set things up to use THRM1 as a temperature lower bound, and THRM2 as an upper bound.
+ * Start off at zero
+ */
+
+int tau_initialized = 0;
+
+static void __init TAU_init_smp(void *info)
+{
+	unsigned long cpu = smp_processor_id();
+
+	/* set these to a reasonable value and let the timer shrink the
+	 * window */
+	tau[cpu].low = 5;
+	tau[cpu].high = 120;
+
+	set_thresholds(cpu);
+}
+
+static int __init TAU_init(void)
+{
+	/* We assume in SMP that if one CPU has TAU support, they
+	 * all have it --BenH
+	 */
+	if (!cpu_has_feature(CPU_FTR_TAU)) {
+		printk("Thermal assist unit not available\n");
+		tau_initialized = 0;
+		return 1;
+	}
+
+	tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
+			 !strcmp(cur_cpu_spec->platform, "ppc750");
+
+	tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
+	if (!tau_workq)
+		return -ENOMEM;
+
+	on_each_cpu(TAU_init_smp, NULL, 0);
+
+	queue_work(tau_workq, &tau_work);
+
+	pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
+		tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
+	tau_initialized = 1;
+
+	return 0;
+}
+
+__initcall(TAU_init);
+
+/*
+ * return current temp
+ */
+
+u32 cpu_temp_both(unsigned long cpu)
+{
+	return ((tau[cpu].high << 16) | tau[cpu].low);
+}
+
+u32 cpu_temp(unsigned long cpu)
+{
+	return ((tau[cpu].high + tau[cpu].low) / 2);
+}
+
+u32 tau_interrupts(unsigned long cpu)
+{
+	return (tau[cpu].interrupts);
+}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
new file mode 100644
index 000000000..f6c21f6af
--- /dev/null
+++ b/arch/powerpc/kernel/time.c
@@ -0,0 +1,1197 @@
+/*
+ * Common time routines among all ppc machines.
+ *
+ * Written by Cort Dougan (cort@cs.nmt.edu) to merge
+ * Paul Mackerras' version and mine for PReP and Pmac.
+ * MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
+ * Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
+ *
+ * First round of bugfixes by Gabriel Paubert (paubert@iram.es)
+ * to make clock more stable (2.4.0-test5). The only thing
+ * that this code assumes is that the timebases have been synchronized
+ * by firmware on SMP and are never stopped (never do sleep
+ * on SMP then, nap and doze are OK).
+ * 
+ * Speeded up do_gettimeofday by getting rid of references to
+ * xtime (which required locks for consistency). (mikejc@us.ibm.com)
+ *
+ * TODO (not necessarily in this file):
+ * - improve precision and reproducibility of timebase frequency
+ * measurement at boot time.
+ * - for astronomical applications: add a new function to get
+ * non ambiguous timestamps even around leap seconds. This needs
+ * a new timestamp format and a good name.
+ *
+ * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *             "A Kernel Model for Precision Timekeeping" by Dave Mills
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/kernel_stat.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/init.h>
+#include <linux/profile.h>
+#include <linux/cpu.h>
+#include <linux/security.h>
+#include <linux/percpu.h>
+#include <linux/rtc.h>
+#include <linux/jiffies.h>
+#include <linux/posix-timers.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/irq_work.h>
+#include <linux/clk-provider.h>
+#include <linux/suspend.h>
+#include <linux/rtc.h>
+#include <linux/sched/cputime.h>
+#include <linux/processor.h>
+#include <asm/trace.h>
+
+#include <asm/io.h>
+#include <asm/nvram.h>
+#include <asm/cache.h>
+#include <asm/machdep.h>
+#include <linux/uaccess.h>
+#include <asm/time.h>
+#include <asm/prom.h>
+#include <asm/irq.h>
+#include <asm/div64.h>
+#include <asm/smp.h>
+#include <asm/vdso_datapage.h>
+#include <asm/firmware.h>
+#include <asm/asm-prototypes.h>
+
+/* powerpc clocksource/clockevent code */
+
+#include <linux/clockchips.h>
+#include <linux/timekeeper_internal.h>
+
+static u64 rtc_read(struct clocksource *);
+static struct clocksource clocksource_rtc = {
+	.name         = "rtc",
+	.rating       = 400,
+	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
+	.mask         = CLOCKSOURCE_MASK(64),
+	.read         = rtc_read,
+};
+
+static u64 timebase_read(struct clocksource *);
+static struct clocksource clocksource_timebase = {
+	.name         = "timebase",
+	.rating       = 400,
+	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
+	.mask         = CLOCKSOURCE_MASK(64),
+	.read         = timebase_read,
+};
+
+#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
+
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev);
+static int decrementer_shutdown(struct clock_event_device *evt);
+
+struct clock_event_device decrementer_clockevent = {
+	.name			= "decrementer",
+	.rating			= 200,
+	.irq			= 0,
+	.set_next_event		= decrementer_set_next_event,
+	.set_state_shutdown	= decrementer_shutdown,
+	.tick_resume		= decrementer_shutdown,
+	.features		= CLOCK_EVT_FEAT_ONESHOT |
+				  CLOCK_EVT_FEAT_C3STOP,
+};
+EXPORT_SYMBOL(decrementer_clockevent);
+
+DEFINE_PER_CPU(u64, decrementers_next_tb);
+static DEFINE_PER_CPU(struct clock_event_device, decrementers);
+
+#define XSEC_PER_SEC (1024*1024)
+
+#ifdef CONFIG_PPC64
+#define SCALE_XSEC(xsec, max)	(((xsec) * max) / XSEC_PER_SEC)
+#else
+/* compute ((xsec << 12) * max) >> 32 */
+#define SCALE_XSEC(xsec, max)	mulhwu((xsec) << 12, max)
+#endif
+
+unsigned long tb_ticks_per_jiffy;
+unsigned long tb_ticks_per_usec = 100; /* sane default */
+EXPORT_SYMBOL(tb_ticks_per_usec);
+unsigned long tb_ticks_per_sec;
+EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL_GPL(rtc_lock);
+
+static u64 tb_to_ns_scale __read_mostly;
+static unsigned tb_to_ns_shift __read_mostly;
+static u64 boot_tb __read_mostly;
+
+extern struct timezone sys_tz;
+static long timezone_offset;
+
+unsigned long ppc_proc_freq;
+EXPORT_SYMBOL_GPL(ppc_proc_freq);
+unsigned long ppc_tb_freq;
+EXPORT_SYMBOL_GPL(ppc_tb_freq);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Factor for converting from cputime_t (timebase ticks) to
+ * microseconds. This is stored as 0.64 fixed-point binary fraction.
+ */
+u64 __cputime_usec_factor;
+EXPORT_SYMBOL(__cputime_usec_factor);
+
+#ifdef CONFIG_PPC_SPLPAR
+void (*dtl_consumer)(struct dtl_entry *, u64);
+#endif
+
+static void calc_cputime_factors(void)
+{
+	struct div_result res;
+
+	div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
+	__cputime_usec_factor = res.result_low;
+}
+
+/*
+ * Read the SPURR on systems that have it, otherwise the PURR,
+ * or if that doesn't exist return the timebase value passed in.
+ */
+static unsigned long read_spurr(unsigned long tb)
+{
+	if (cpu_has_feature(CPU_FTR_SPURR))
+		return mfspr(SPRN_SPURR);
+	if (cpu_has_feature(CPU_FTR_PURR))
+		return mfspr(SPRN_PURR);
+	return tb;
+}
+
+#ifdef CONFIG_PPC_SPLPAR
+
+/*
+ * Scan the dispatch trace log and count up the stolen time.
+ * Should be called with interrupts disabled.
+ */
+static u64 scan_dispatch_log(u64 stop_tb)
+{
+	u64 i = local_paca->dtl_ridx;
+	struct dtl_entry *dtl = local_paca->dtl_curr;
+	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
+	struct lppaca *vpa = local_paca->lppaca_ptr;
+	u64 tb_delta;
+	u64 stolen = 0;
+	u64 dtb;
+
+	if (!dtl)
+		return 0;
+
+	if (i == be64_to_cpu(vpa->dtl_idx))
+		return 0;
+	while (i < be64_to_cpu(vpa->dtl_idx)) {
+		dtb = be64_to_cpu(dtl->timebase);
+		tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
+			be32_to_cpu(dtl->ready_to_enqueue_time);
+		barrier();
+		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
+			/* buffer has overflowed */
+			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
+			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+			continue;
+		}
+		if (dtb > stop_tb)
+			break;
+		if (dtl_consumer)
+			dtl_consumer(dtl, i);
+		stolen += tb_delta;
+		++i;
+		++dtl;
+		if (dtl == dtl_end)
+			dtl = local_paca->dispatch_log;
+	}
+	local_paca->dtl_ridx = i;
+	local_paca->dtl_curr = dtl;
+	return stolen;
+}
+
+/*
+ * Accumulate stolen time by scanning the dispatch trace log.
+ * Called on entry from user mode.
+ */
+void notrace accumulate_stolen_time(void)
+{
+	u64 sst, ust;
+	unsigned long save_irq_soft_mask = irq_soft_mask_return();
+	struct cpu_accounting_data *acct = &local_paca->accounting;
+
+	/* We are called early in the exception entry, before
+	 * soft/hard_enabled are sync'ed to the expected state
+	 * for the exception. We are hard disabled but the PACA
+	 * needs to reflect that so various debug stuff doesn't
+	 * complain
+	 */
+	irq_soft_mask_set(IRQS_DISABLED);
+
+	sst = scan_dispatch_log(acct->starttime_user);
+	ust = scan_dispatch_log(acct->starttime);
+	acct->stime -= sst;
+	acct->utime -= ust;
+	acct->steal_time += ust + sst;
+
+	irq_soft_mask_set(save_irq_soft_mask);
+}
+
+static inline u64 calculate_stolen_time(u64 stop_tb)
+{
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return 0;
+
+	if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx))
+		return scan_dispatch_log(stop_tb);
+
+	return 0;
+}
+
+#else /* CONFIG_PPC_SPLPAR */
+static inline u64 calculate_stolen_time(u64 stop_tb)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PPC_SPLPAR */
+
+/*
+ * Account time for a transition between system, hard irq
+ * or soft irq state.
+ */
+static unsigned long vtime_delta(struct task_struct *tsk,
+				 unsigned long *stime_scaled,
+				 unsigned long *steal_time)
+{
+	unsigned long now, nowscaled, deltascaled;
+	unsigned long stime;
+	unsigned long utime, utime_scaled;
+	struct cpu_accounting_data *acct = get_accounting(tsk);
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	now = mftb();
+	nowscaled = read_spurr(now);
+	stime = now - acct->starttime;
+	acct->starttime = now;
+	deltascaled = nowscaled - acct->startspurr;
+	acct->startspurr = nowscaled;
+
+	*steal_time = calculate_stolen_time(now);
+
+	utime = acct->utime - acct->utime_sspurr;
+	acct->utime_sspurr = acct->utime;
+
+	/*
+	 * Because we don't read the SPURR on every kernel entry/exit,
+	 * deltascaled includes both user and system SPURR ticks.
+	 * Apportion these ticks to system SPURR ticks and user
+	 * SPURR ticks in the same ratio as the system time (delta)
+	 * and user time (udelta) values obtained from the timebase
+	 * over the same interval.  The system ticks get accounted here;
+	 * the user ticks get saved up in paca->user_time_scaled to be
+	 * used by account_process_tick.
+	 */
+	*stime_scaled = stime;
+	utime_scaled = utime;
+	if (deltascaled != stime + utime) {
+		if (utime) {
+			*stime_scaled = deltascaled * stime / (stime + utime);
+			utime_scaled = deltascaled - *stime_scaled;
+		} else {
+			*stime_scaled = deltascaled;
+		}
+	}
+	acct->utime_scaled += utime_scaled;
+
+	return stime;
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	unsigned long stime, stime_scaled, steal_time;
+	struct cpu_accounting_data *acct = get_accounting(tsk);
+
+	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
+
+	stime -= min(stime, steal_time);
+	acct->steal_time += steal_time;
+
+	if ((tsk->flags & PF_VCPU) && !irq_count()) {
+		acct->gtime += stime;
+		acct->utime_scaled += stime_scaled;
+	} else {
+		if (hardirq_count())
+			acct->hardirq_time += stime;
+		else if (in_serving_softirq())
+			acct->softirq_time += stime;
+		else
+			acct->stime += stime;
+
+		acct->stime_scaled += stime_scaled;
+	}
+}
+EXPORT_SYMBOL_GPL(vtime_account_system);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	unsigned long stime, stime_scaled, steal_time;
+	struct cpu_accounting_data *acct = get_accounting(tsk);
+
+	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
+	acct->idle_time += stime + steal_time;
+}
+
+/*
+ * Account the whole cputime accumulated in the paca
+ * Must be called with interrupts disabled.
+ * Assumes that vtime_account_system/idle() has been called
+ * recently (i.e. since the last entry from usermode) so that
+ * get_paca()->user_time_scaled is up to date.
+ */
+void vtime_flush(struct task_struct *tsk)
+{
+	struct cpu_accounting_data *acct = get_accounting(tsk);
+
+	if (acct->utime)
+		account_user_time(tsk, cputime_to_nsecs(acct->utime));
+
+	if (acct->utime_scaled)
+		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
+
+	if (acct->gtime)
+		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
+
+	if (acct->steal_time)
+		account_steal_time(cputime_to_nsecs(acct->steal_time));
+
+	if (acct->idle_time)
+		account_idle_time(cputime_to_nsecs(acct->idle_time));
+
+	if (acct->stime)
+		account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
+					  CPUTIME_SYSTEM);
+	if (acct->stime_scaled)
+		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
+
+	if (acct->hardirq_time)
+		account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
+					  CPUTIME_IRQ);
+	if (acct->softirq_time)
+		account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
+					  CPUTIME_SOFTIRQ);
+
+	acct->utime = 0;
+	acct->utime_scaled = 0;
+	acct->utime_sspurr = 0;
+	acct->gtime = 0;
+	acct->steal_time = 0;
+	acct->idle_time = 0;
+	acct->stime = 0;
+	acct->stime_scaled = 0;
+	acct->hardirq_time = 0;
+	acct->softirq_time = 0;
+}
+
+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+#define calc_cputime_factors()
+#endif
+
+void __delay(unsigned long loops)
+{
+	unsigned long start;
+	int diff;
+
+	spin_begin();
+	if (__USE_RTC()) {
+		start = get_rtcl();
+		do {
+			/* the RTCL register wraps at 1000000000 */
+			diff = get_rtcl() - start;
+			if (diff < 0)
+				diff += 1000000000;
+			spin_cpu_relax();
+		} while (diff < loops);
+	} else {
+		start = get_tbl();
+		while (get_tbl() - start < loops)
+			spin_cpu_relax();
+	}
+	spin_end();
+}
+EXPORT_SYMBOL(__delay);
+
+void udelay(unsigned long usecs)
+{
+	__delay(tb_ticks_per_usec * usecs);
+}
+EXPORT_SYMBOL(udelay);
+
+#ifdef CONFIG_SMP
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	if (in_lock_functions(pc))
+		return regs->link;
+
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+#ifdef CONFIG_IRQ_WORK
+
+/*
+ * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
+ */
+#ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
+	return x;
+}
+
+static inline void set_irq_work_pending_flag(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (1),
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
+}
+
+static inline void clear_irq_work_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
+}
+
+#else /* 32-bit */
+
+DEFINE_PER_CPU(u8, irq_work_pending);
+
+#define set_irq_work_pending_flag()	__this_cpu_write(irq_work_pending, 1)
+#define test_irq_work_pending()		__this_cpu_read(irq_work_pending)
+#define clear_irq_work_pending()	__this_cpu_write(irq_work_pending, 0)
+
+#endif /* 32 vs 64 bit */
+
+void arch_irq_work_raise(void)
+{
+	/*
+	 * 64-bit code that uses irq soft-mask can just cause an immediate
+	 * interrupt here that gets soft masked, if this is called under
+	 * local_irq_disable(). It might be possible to prevent that happening
+	 * by noticing interrupts are disabled and setting decrementer pending
+	 * to be replayed when irqs are enabled. The problem there is that
+	 * tracing can call irq_work_raise, including in code that does low
+	 * level manipulations of irq soft-mask state (e.g., trace_hardirqs_on)
+	 * which could get tangled up if we're messing with the same state
+	 * here.
+	 */
+	preempt_disable();
+	set_irq_work_pending_flag();
+	set_dec(1);
+	preempt_enable();
+}
+
+#else  /* CONFIG_IRQ_WORK */
+
+#define test_irq_work_pending()	0
+#define clear_irq_work_pending()
+
+#endif /* CONFIG_IRQ_WORK */
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+void timer_interrupt(struct pt_regs *regs)
+{
+	struct clock_event_device *evt = this_cpu_ptr(&decrementers);
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+	struct pt_regs *old_regs;
+	u64 now;
+
+	/* Some implementations of hotplug will get timer interrupts while
+	 * offline, just ignore these and we also need to set
+	 * decrementers_next_tb as MAX to make sure __check_irq_replay
+	 * don't replay timer interrupt when return, otherwise we'll trap
+	 * here infinitely :(
+	 */
+	if (unlikely(!cpu_online(smp_processor_id()))) {
+		*next_tb = ~(u64)0;
+		set_dec(decrementer_max);
+		return;
+	}
+
+	/* Ensure a positive value is written to the decrementer, or else
+	 * some CPUs will continue to take decrementer exceptions. When the
+	 * PPC_WATCHDOG (decrementer based) is configured, keep this at most
+	 * 31 bits, which is about 4 seconds on most systems, which gives
+	 * the watchdog a chance of catching timer interrupt hard lockups.
+	 */
+	if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
+		set_dec(0x7fffffff);
+	else
+		set_dec(decrementer_max);
+
+	/* Conditionally hard-enable interrupts now that the DEC has been
+	 * bumped to its maximum value
+	 */
+	may_hard_irq_enable();
+
+
+#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
+	if (atomic_read(&ppc_n_lost_interrupts) != 0)
+		do_IRQ(regs);
+#endif
+
+	old_regs = set_irq_regs(regs);
+	irq_enter();
+	trace_timer_interrupt_entry(regs);
+
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
+	}
+
+	now = get_tb_or_rtc();
+	if (now >= *next_tb) {
+		*next_tb = ~(u64)0;
+		if (evt->event_handler)
+			evt->event_handler(evt);
+		__this_cpu_inc(irq_stat.timer_irqs_event);
+	} else {
+		now = *next_tb - now;
+		if (now <= decrementer_max)
+			set_dec(now);
+		/* We may have raced with new irq work */
+		if (test_irq_work_pending())
+			set_dec(1);
+		__this_cpu_inc(irq_stat.timer_irqs_others);
+	}
+
+	trace_timer_interrupt_exit(regs);
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+EXPORT_SYMBOL(timer_interrupt);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+void timer_broadcast_interrupt(void)
+{
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+	*next_tb = ~(u64)0;
+	tick_receive_broadcast();
+	__this_cpu_inc(irq_stat.broadcast_irqs_event);
+}
+#endif
+
+/*
+ * Hypervisor decrementer interrupts shouldn't occur but are sometimes
+ * left pending on exit from a KVM guest.  We don't need to do anything
+ * to clear them, as they are edge-triggered.
+ */
+void hdec_interrupt(struct pt_regs *regs)
+{
+}
+
+#ifdef CONFIG_SUSPEND
+static void generic_suspend_disable_irqs(void)
+{
+	/* Disable the decrementer, so that it doesn't interfere
+	 * with suspending.
+	 */
+
+	set_dec(decrementer_max);
+	local_irq_disable();
+	set_dec(decrementer_max);
+}
+
+static void generic_suspend_enable_irqs(void)
+{
+	local_irq_enable();
+}
+
+/* Overrides the weak version in kernel/power/main.c */
+void arch_suspend_disable_irqs(void)
+{
+	if (ppc_md.suspend_disable_irqs)
+		ppc_md.suspend_disable_irqs();
+	generic_suspend_disable_irqs();
+}
+
+/* Overrides the weak version in kernel/power/main.c */
+void arch_suspend_enable_irqs(void)
+{
+	generic_suspend_enable_irqs();
+	if (ppc_md.suspend_enable_irqs)
+		ppc_md.suspend_enable_irqs();
+}
+#endif
+
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ *
+ * Note: mulhdu(a, b) (multiply high double unsigned) returns
+ * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
+ * are 64-bit unsigned numbers.
+ */
+notrace unsigned long long sched_clock(void)
+{
+	if (__USE_RTC())
+		return get_rtc();
+	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+}
+
+
+#ifdef CONFIG_PPC_PSERIES
+
+/*
+ * Running clock - attempts to give a view of time passing for a virtualised
+ * kernels.
+ * Uses the VTB register if available otherwise a next best guess.
+ */
+unsigned long long running_clock(void)
+{
+	/*
+	 * Don't read the VTB as a host since KVM does not switch in host
+	 * timebase into the VTB when it takes a guest off the CPU, reading the
+	 * VTB would result in reading 'last switched out' guest VTB.
+	 *
+	 * Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it
+	 * would be unsafe to rely only on the #ifdef above.
+	 */
+	if (firmware_has_feature(FW_FEATURE_LPAR) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S))
+		return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+
+	/*
+	 * This is a next best approximation without a VTB.
+	 * On a host which is running bare metal there should never be any stolen
+	 * time and on a host which doesn't do any virtualisation TB *should* equal
+	 * VTB so it makes no difference anyway.
+	 */
+	return local_clock() - kcpustat_this_cpu->cpustat[CPUTIME_STEAL];
+}
+#endif
+
+static int __init get_freq(char *name, int cells, unsigned long *val)
+{
+	struct device_node *cpu;
+	const __be32 *fp;
+	int found = 0;
+
+	/* The cpu node should have timebase and clock frequency properties */
+	cpu = of_find_node_by_type(NULL, "cpu");
+
+	if (cpu) {
+		fp = of_get_property(cpu, name, NULL);
+		if (fp) {
+			found = 1;
+			*val = of_read_ulong(fp, cells);
+		}
+
+		of_node_put(cpu);
+	}
+
+	return found;
+}
+
+static void start_cpu_decrementer(void)
+{
+#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+	unsigned int tcr;
+
+	/* Clear any pending timer interrupts */
+	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
+
+	tcr = mfspr(SPRN_TCR);
+	/*
+	 * The watchdog may have already been enabled by u-boot. So leave
+	 * TRC[WP] (Watchdog Period) alone.
+	 */
+	tcr &= TCR_WP_MASK;	/* Clear all bits except for TCR[WP] */
+	tcr |= TCR_DIE;		/* Enable decrementer */
+	mtspr(SPRN_TCR, tcr);
+#endif
+}
+
+void __init generic_calibrate_decr(void)
+{
+	ppc_tb_freq = DEFAULT_TB_FREQ;		/* hardcoded default */
+
+	if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
+	    !get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
+
+		printk(KERN_ERR "WARNING: Estimating decrementer frequency "
+				"(not found)\n");
+	}
+
+	ppc_proc_freq = DEFAULT_PROC_FREQ;	/* hardcoded default */
+
+	if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
+	    !get_freq("clock-frequency", 1, &ppc_proc_freq)) {
+
+		printk(KERN_ERR "WARNING: Estimating processor frequency "
+				"(not found)\n");
+	}
+}
+
+int update_persistent_clock64(struct timespec64 now)
+{
+	struct rtc_time tm;
+
+	if (!ppc_md.set_rtc_time)
+		return -ENODEV;
+
+	rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);
+
+	return ppc_md.set_rtc_time(&tm);
+}
+
+static void __read_persistent_clock(struct timespec64 *ts)
+{
+	struct rtc_time tm;
+	static int first = 1;
+
+	ts->tv_nsec = 0;
+	/* XXX this is a litle fragile but will work okay in the short term */
+	if (first) {
+		first = 0;
+		if (ppc_md.time_init)
+			timezone_offset = ppc_md.time_init();
+
+		/* get_boot_time() isn't guaranteed to be safe to call late */
+		if (ppc_md.get_boot_time) {
+			ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
+			return;
+		}
+	}
+	if (!ppc_md.get_rtc_time) {
+		ts->tv_sec = 0;
+		return;
+	}
+	ppc_md.get_rtc_time(&tm);
+
+	ts->tv_sec = rtc_tm_to_time64(&tm);
+}
+
+void read_persistent_clock64(struct timespec64 *ts)
+{
+	__read_persistent_clock(ts);
+
+	/* Sanitize it in case real time clock is set below EPOCH */
+	if (ts->tv_sec < 0) {
+		ts->tv_sec = 0;
+		ts->tv_nsec = 0;
+	}
+		
+}
+
+/* clocksource code */
+static notrace u64 rtc_read(struct clocksource *cs)
+{
+	return (u64)get_rtc();
+}
+
+static notrace u64 timebase_read(struct clocksource *cs)
+{
+	return (u64)get_tb();
+}
+
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct timespec xt;
+	struct clocksource *clock = tk->tkr_mono.clock;
+	u32 mult = tk->tkr_mono.mult;
+	u32 shift = tk->tkr_mono.shift;
+	u64 cycle_last = tk->tkr_mono.cycle_last;
+	u64 new_tb_to_xs, new_stamp_xsec;
+	u64 frac_sec;
+
+	if (clock != &clocksource_timebase)
+		return;
+
+	xt.tv_sec = tk->xtime_sec;
+	xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+
+	/* Make userspace gettimeofday spin until we're done. */
+	++vdso_data->tb_update_count;
+	smp_mb();
+
+	/*
+	 * This computes ((2^20 / 1e9) * mult) >> shift as a
+	 * 0.64 fixed-point fraction.
+	 * The computation in the else clause below won't overflow
+	 * (as long as the timebase frequency is >= 1.049 MHz)
+	 * but loses precision because we lose the low bits of the constant
+	 * in the shift.  Note that 19342813113834067 ~= 2^(20+64) / 1e9.
+	 * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
+	 * over a second.  (Shift values are usually 22, 23 or 24.)
+	 * For high frequency clocks such as the 512MHz timebase clock
+	 * on POWER[6789], the mult value is small (e.g. 32768000)
+	 * and so we can shift the constant by 16 initially
+	 * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
+	 * remaining shifts after the multiplication, which gives a
+	 * more accurate result (e.g. with mult = 32768000, shift = 24,
+	 * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
+	 */
+	if (mult <= 62500000 && clock->shift >= 16)
+		new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
+	else
+		new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
+
+	/*
+	 * Compute the fractional second in units of 2^-32 seconds.
+	 * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
+	 * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
+	 * it in units of 2^-32 seconds.
+	 * We assume shift <= 32 because clocks_calc_mult_shift()
+	 * generates shift values in the range 0 - 32.
+	 */
+	frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
+	do_div(frac_sec, NSEC_PER_SEC);
+
+	/*
+	 * Work out new stamp_xsec value for any legacy users of systemcfg.
+	 * stamp_xsec is in units of 2^-20 seconds.
+	 */
+	new_stamp_xsec = frac_sec >> 12;
+	new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
+
+	/*
+	 * tb_update_count is used to allow the userspace gettimeofday code
+	 * to assure itself that it sees a consistent view of the tb_to_xs and
+	 * stamp_xsec variables.  It reads the tb_update_count, then reads
+	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
+	 * the two values of tb_update_count match and are even then the
+	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
+	 * loops back and reads them again until this criteria is met.
+	 */
+	vdso_data->tb_orig_stamp = cycle_last;
+	vdso_data->stamp_xsec = new_stamp_xsec;
+	vdso_data->tb_to_xs = new_tb_to_xs;
+	vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
+	vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
+	vdso_data->stamp_xtime = xt;
+	vdso_data->stamp_sec_fraction = frac_sec;
+	vdso_data->hrtimer_res = hrtimer_resolution;
+	smp_wmb();
+	++(vdso_data->tb_update_count);
+}
+
+void update_vsyscall_tz(void)
+{
+	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+}
+
+static void __init clocksource_init(void)
+{
+	struct clocksource *clock;
+
+	if (__USE_RTC())
+		clock = &clocksource_rtc;
+	else
+		clock = &clocksource_timebase;
+
+	if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
+		printk(KERN_ERR "clocksource: %s is already registered\n",
+		       clock->name);
+		return;
+	}
+
+	printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
+	       clock->name, clock->mult, clock->shift);
+}
+
+static int decrementer_set_next_event(unsigned long evt,
+				      struct clock_event_device *dev)
+{
+	__this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt);
+	set_dec(evt);
+
+	/* We may have raced with new irq work */
+	if (test_irq_work_pending())
+		set_dec(1);
+
+	return 0;
+}
+
+static int decrementer_shutdown(struct clock_event_device *dev)
+{
+	decrementer_set_next_event(decrementer_max, dev);
+	return 0;
+}
+
+static void register_decrementer_clockevent(int cpu)
+{
+	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
+
+	*dec = decrementer_clockevent;
+	dec->cpumask = cpumask_of(cpu);
+
+	clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
+
+	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
+		    dec->name, dec->mult, dec->shift, cpu);
+
+	/* Set values for KVM, see kvm_emulate_dec() */
+	decrementer_clockevent.mult = dec->mult;
+	decrementer_clockevent.shift = dec->shift;
+}
+
+static void enable_large_decrementer(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return;
+
+	if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
+		return;
+
+	/*
+	 * If we're running as the hypervisor we need to enable the LD manually
+	 * otherwise firmware should have done it for us.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_LD);
+}
+
+static void __init set_decrementer_max(void)
+{
+	struct device_node *cpu;
+	u32 bits = 32;
+
+	/* Prior to ISAv3 the decrementer is always 32 bit */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return;
+
+	cpu = of_find_node_by_type(NULL, "cpu");
+
+	if (of_property_read_u32(cpu, "ibm,dec-bits", &bits) == 0) {
+		if (bits > 64 || bits < 32) {
+			pr_warn("time_init: firmware supplied invalid ibm,dec-bits");
+			bits = 32;
+		}
+
+		/* calculate the signed maximum given this many bits */
+		decrementer_max = (1ul << (bits - 1)) - 1;
+	}
+
+	of_node_put(cpu);
+
+	pr_info("time_init: %u bit decrementer (max: %llx)\n",
+		bits, decrementer_max);
+}
+
+static void __init init_decrementer_clockevent(void)
+{
+	register_decrementer_clockevent(smp_processor_id());
+}
+
+void secondary_cpu_time_init(void)
+{
+	/* Enable and test the large decrementer for this cpu */
+	enable_large_decrementer();
+
+	/* Start the decrementer on CPUs that have manual control
+	 * such as BookE
+	 */
+	start_cpu_decrementer();
+
+	/* FIME: Should make unrelatred change to move snapshot_timebase
+	 * call here ! */
+	register_decrementer_clockevent(smp_processor_id());
+}
+
+/* This function is only called on the boot processor */
+void __init time_init(void)
+{
+	struct div_result res;
+	u64 scale;
+	unsigned shift;
+
+	if (__USE_RTC()) {
+		/* 601 processor: dec counts down by 128 every 128ns */
+		ppc_tb_freq = 1000000000;
+	} else {
+		/* Normal PowerPC with timebase register */
+		ppc_md.calibrate_decr();
+		printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
+		       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
+		printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
+		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
+	}
+
+	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
+	tb_ticks_per_sec = ppc_tb_freq;
+	tb_ticks_per_usec = ppc_tb_freq / 1000000;
+	calc_cputime_factors();
+
+	/*
+	 * Compute scale factor for sched_clock.
+	 * The calibrate_decr() function has set tb_ticks_per_sec,
+	 * which is the timebase frequency.
+	 * We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
+	 * the 128-bit result as a 64.64 fixed-point number.
+	 * We then shift that number right until it is less than 1.0,
+	 * giving us the scale factor and shift count to use in
+	 * sched_clock().
+	 */
+	div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
+	scale = res.result_low;
+	for (shift = 0; res.result_high != 0; ++shift) {
+		scale = (scale >> 1) | (res.result_high << 63);
+		res.result_high >>= 1;
+	}
+	tb_to_ns_scale = scale;
+	tb_to_ns_shift = shift;
+	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
+	boot_tb = get_tb_or_rtc();
+
+	/* If platform provided a timezone (pmac), we correct the time */
+	if (timezone_offset) {
+		sys_tz.tz_minuteswest = -timezone_offset / 60;
+		sys_tz.tz_dsttime = 0;
+	}
+
+	vdso_data->tb_update_count = 0;
+	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
+
+	/* initialise and enable the large decrementer (if we have one) */
+	set_decrementer_max();
+	enable_large_decrementer();
+
+	/* Start the decrementer on CPUs that have manual control
+	 * such as BookE
+	 */
+	start_cpu_decrementer();
+
+	/* Register the clocksource */
+	clocksource_init();
+
+	init_decrementer_clockevent();
+	tick_setup_hrtimer_broadcast();
+
+#ifdef CONFIG_COMMON_CLK
+	of_clk_init(NULL);
+#endif
+}
+
+/*
+ * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
+ * result.
+ */
+void div128_by_32(u64 dividend_high, u64 dividend_low,
+		  unsigned divisor, struct div_result *dr)
+{
+	unsigned long a, b, c, d;
+	unsigned long w, x, y, z;
+	u64 ra, rb, rc;
+
+	a = dividend_high >> 32;
+	b = dividend_high & 0xffffffff;
+	c = dividend_low >> 32;
+	d = dividend_low & 0xffffffff;
+
+	w = a / divisor;
+	ra = ((u64)(a - (w * divisor)) << 32) + b;
+
+	rb = ((u64) do_div(ra, divisor) << 32) + c;
+	x = ra;
+
+	rc = ((u64) do_div(rb, divisor) << 32) + d;
+	y = rb;
+
+	do_div(rc, divisor);
+	z = rc;
+
+	dr->result_high = ((u64)w << 32) + x;
+	dr->result_low  = ((u64)y << 32) + z;
+
+}
+
+/* We don't need to calibrate delay, we use the CPU timebase for that */
+void calibrate_delay(void)
+{
+	/* Some generic code (such as spinlock debug) use loops_per_jiffy
+	 * as the number of __delay(1) in a jiffy, so make it so
+	 */
+	loops_per_jiffy = tb_ticks_per_jiffy;
+}
+
+#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
+static int rtc_generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+	ppc_md.get_rtc_time(tm);
+	return 0;
+}
+
+static int rtc_generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+	if (!ppc_md.set_rtc_time)
+		return -EOPNOTSUPP;
+
+	if (ppc_md.set_rtc_time(tm) < 0)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static const struct rtc_class_ops rtc_generic_ops = {
+	.read_time = rtc_generic_get_time,
+	.set_time = rtc_generic_set_time,
+};
+
+static int __init rtc_init(void)
+{
+	struct platform_device *pdev;
+
+	if (!ppc_md.get_rtc_time)
+		return -ENODEV;
+
+	pdev = platform_device_register_data(NULL, "rtc-generic", -1,
+					     &rtc_generic_ops,
+					     sizeof(rtc_generic_ops));
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
+
+device_initcall(rtc_init);
+#endif
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
new file mode 100644
index 000000000..771637478
--- /dev/null
+++ b/arch/powerpc/kernel/tm.S
@@ -0,0 +1,516 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Transactional memory support routines to reclaim and recheckpoint
+ * transactional process state.
+ *
+ * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/bug.h>
+#include <asm/export.h>
+#include <asm/feature-fixups.h>
+
+#ifdef CONFIG_VSX
+/* See fpu.S, this is borrowed from there */
+#define __SAVE_32FPRS_VSRS(n,c,base)		\
+BEGIN_FTR_SECTION				\
+	b	2f;				\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
+	SAVE_32FPRS(n,base);			\
+	b	3f;				\
+2:	SAVE_32VSRS(n,c,base);			\
+3:
+#define __REST_32FPRS_VSRS(n,c,base)		\
+BEGIN_FTR_SECTION				\
+	b	2f;				\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
+	REST_32FPRS(n,base);			\
+	b	3f;				\
+2:	REST_32VSRS(n,c,base);			\
+3:
+#else
+#define __SAVE_32FPRS_VSRS(n,c,base)	SAVE_32FPRS(n, base)
+#define __REST_32FPRS_VSRS(n,c,base)	REST_32FPRS(n, base)
+#endif
+#define SAVE_32FPRS_VSRS(n,c,base) \
+	__SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+#define REST_32FPRS_VSRS(n,c,base) \
+	__REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+
+/* Stack frame offsets for local variables. */
+#define TM_FRAME_L0	TM_FRAME_SIZE-16
+#define TM_FRAME_L1	TM_FRAME_SIZE-8
+
+
+/* In order to access the TM SPRs, TM must be enabled.  So, do so: */
+_GLOBAL(tm_enable)
+	mfmsr	r4
+	li	r3, MSR_TM >> 32
+	sldi	r3, r3, 32
+	and.	r0, r4, r3
+	bne	1f
+	or	r4, r4, r3
+	mtmsrd	r4
+1:	blr
+EXPORT_SYMBOL_GPL(tm_enable);
+
+_GLOBAL(tm_disable)
+	mfmsr	r4
+	li	r3, MSR_TM >> 32
+	sldi	r3, r3, 32
+	andc	r4, r4, r3
+	mtmsrd	r4
+	blr
+EXPORT_SYMBOL_GPL(tm_disable);
+
+_GLOBAL(tm_save_sprs)
+	mfspr	r0, SPRN_TFHAR
+	std	r0, THREAD_TM_TFHAR(r3)
+	mfspr	r0, SPRN_TEXASR
+	std	r0, THREAD_TM_TEXASR(r3)
+	mfspr	r0, SPRN_TFIAR
+	std	r0, THREAD_TM_TFIAR(r3)
+	blr
+
+_GLOBAL(tm_restore_sprs)
+	ld	r0, THREAD_TM_TFHAR(r3)
+	mtspr	SPRN_TFHAR, r0
+	ld	r0, THREAD_TM_TEXASR(r3)
+	mtspr	SPRN_TEXASR, r0
+	ld	r0, THREAD_TM_TFIAR(r3)
+	mtspr	SPRN_TFIAR, r0
+	blr
+
+	/* Passed an 8-bit failure cause as first argument. */
+_GLOBAL(tm_abort)
+	TABORT(R3)
+	blr
+EXPORT_SYMBOL_GPL(tm_abort);
+
+/* void tm_reclaim(struct thread_struct *thread,
+ *		   uint8_t cause)
+ *
+ *	- Performs a full reclaim.  This destroys outstanding
+ *	  transactions and updates thread->regs.tm_ckpt_* with the
+ *	  original checkpointed state.  Note that thread->regs is
+ *	  unchanged.
+ *
+ * Purpose is to both abort transactions of, and preserve the state of,
+ * a transactions at a context switch. We preserve/restore both sets of process
+ * state to restore them when the thread's scheduled again.  We continue in
+ * userland as though nothing happened, but when the transaction is resumed
+ * they will abort back to the checkpointed state we save out here.
+ *
+ * Call with IRQs off, stacks get all out of sync for some periods in here!
+ */
+_GLOBAL(tm_reclaim)
+	mfcr	r5
+	mflr	r0
+	stw	r5, 8(r1)
+	std	r0, 16(r1)
+	std	r2, STK_GOT(r1)
+	stdu	r1, -TM_FRAME_SIZE(r1)
+
+	/* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */
+
+	std	r3, STK_PARAM(R3)(r1)
+	SAVE_NVGPRS(r1)
+
+	/* We need to setup MSR for VSX register save instructions. */
+	mfmsr	r14
+	mr	r15, r14
+	ori	r15, r15, MSR_FP
+	li	r16, 0
+	ori	r16, r16, MSR_EE /* IRQs hard off */
+	andc	r15, r15, r16
+	oris	r15, r15, MSR_VEC@h
+#ifdef CONFIG_VSX
+	BEGIN_FTR_SECTION
+	oris	r15,r15, MSR_VSX@h
+	END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r15
+	std	r14, TM_FRAME_L0(r1)
+
+	/* Do sanity check on MSR to make sure we are suspended */
+	li	r7, (MSR_TS_S)@higher
+	srdi	r6, r14, 32
+	and	r6, r6, r7
+1:	tdeqi   r6, 0
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+	/* Stash the stack pointer away for use after reclaim */
+	std	r1, PACAR1(r13)
+
+	/* Clear MSR RI since we are about to change r1, EE is already off. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/*
+	 * BE CAREFUL HERE:
+	 * At this point we can't take an SLB miss since we have MSR_RI
+	 * off. Load only to/from the stack/paca which are in SLB bolted regions
+	 * until we turn MSR RI back on.
+	 *
+	 * The moment we treclaim, ALL of our GPRs will switch
+	 * to user register state.  (FPRs, CCR etc. also!)
+	 * Use an sprg and a tm_scratch in the PACA to shuffle.
+	 */
+	TRECLAIM(R4)				/* Cause in r4 */
+
+	/* ******************** GPRs ******************** */
+	/* Stash the checkpointed r13 away in the scratch SPR and get the real
+	 *  paca
+	 */
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+
+	/* Stash the checkpointed r1 away in paca tm_scratch and get the real
+	 * stack pointer back
+	 */
+	std	r1, PACATMSCRATCH(r13)
+	ld	r1, PACAR1(r13)
+
+	std	r11, GPR11(r1)			/* Temporary stash */
+
+	/*
+	 * Move the saved user r1 to the kernel stack in case PACATMSCRATCH is
+	 * clobbered by an exception once we turn on MSR_RI below.
+	 */
+	ld	r11, PACATMSCRATCH(r13)
+	std	r11, GPR1(r1)
+
+	/*
+	 * Store r13 away so we can free up the scratch SPR for the SLB fault
+	 * handler (needed once we start accessing the thread_struct).
+	 */
+	GET_SCRATCH0(r11)
+	std	r11, GPR13(r1)
+
+	/* Reset MSR RI so we can take SLB faults again */
+	li	r11, MSR_RI
+	mtmsrd	r11, 1
+
+	/* Store the PPR in r11 and reset to decent value */
+	mfspr	r11, SPRN_PPR
+	HMT_MEDIUM
+
+	/* Now get some more GPRS free */
+	std	r7, GPR7(r1)			/* Temporary stash */
+	std	r12, GPR12(r1)			/* ''   ''    ''   */
+	ld	r12, STK_PARAM(R3)(r1)		/* Param 0, thread_struct * */
+
+	std	r11, THREAD_TM_PPR(r12)		/* Store PPR and free r11 */
+
+	addi	r7, r12, PT_CKPT_REGS		/* Thread's ckpt_regs */
+
+	/* Make r7 look like an exception frame so that we
+	 * can use the neat GPRx(n) macros.  r7 is NOT a pt_regs ptr!
+	 */
+	subi	r7, r7, STACK_FRAME_OVERHEAD
+
+	/* Sync the userland GPRs 2-12, 14-31 to thread->regs: */
+	SAVE_GPR(0, r7)				/* user r0 */
+	SAVE_GPR(2, r7)			/* user r2 */
+	SAVE_4GPRS(3, r7)			/* user r3-r6 */
+	SAVE_GPR(8, r7)				/* user r8 */
+	SAVE_GPR(9, r7)				/* user r9 */
+	SAVE_GPR(10, r7)			/* user r10 */
+	ld	r3, GPR1(r1)			/* user r1 */
+	ld	r4, GPR7(r1)			/* user r7 */
+	ld	r5, GPR11(r1)			/* user r11 */
+	ld	r6, GPR12(r1)			/* user r12 */
+	ld	r8, GPR13(r1)			/* user r13 */
+	std	r3, GPR1(r7)
+	std	r4, GPR7(r7)
+	std	r5, GPR11(r7)
+	std	r6, GPR12(r7)
+	std	r8, GPR13(r7)
+
+	SAVE_NVGPRS(r7)				/* user r14-r31 */
+
+	/* ******************** NIP ******************** */
+	mfspr	r3, SPRN_TFHAR
+	std	r3, _NIP(r7)			/* Returns to failhandler */
+	/* The checkpointed NIP is ignored when rescheduling/rechkpting,
+	 * but is used in signal return to 'wind back' to the abort handler.
+	 */
+
+	/* ******************** CR,LR,CCR,MSR ********** */
+	mfctr	r3
+	mflr	r4
+	mfcr	r5
+	mfxer	r6
+
+	std	r3, _CTR(r7)
+	std	r4, _LINK(r7)
+	std	r5, _CCR(r7)
+	std	r6, _XER(r7)
+
+
+	/* ******************** TAR, DSCR ********** */
+	mfspr	r3, SPRN_TAR
+	mfspr	r4, SPRN_DSCR
+
+	std	r3, THREAD_TM_TAR(r12)
+	std	r4, THREAD_TM_DSCR(r12)
+
+	/* MSR and flags:  We don't change CRs, and we don't need to alter
+	 * MSR.
+	 */
+
+
+	/* ******************** FPR/VR/VSRs ************
+	 * After reclaiming, capture the checkpointed FPRs/VRs.
+	 *
+	 * We enabled VEC/FP/VSX in the msr above, so we can execute these
+	 * instructions!
+	 */
+	mr	r3, r12
+
+	/* Altivec (VEC/VMX/VR)*/
+	addi	r7, r3, THREAD_CKVRSTATE
+	SAVE_32VRS(0, r6, r7)	/* r6 scratch, r7 transact vr state */
+	mfvscr	v0
+	li	r6, VRSTATE_VSCR
+	stvx	v0, r7, r6
+
+	/* VRSAVE */
+	mfspr	r0, SPRN_VRSAVE
+	std	r0, THREAD_CKVRSAVE(r3)
+
+	/* Floating Point (FP) */
+	addi	r7, r3, THREAD_CKFPSTATE
+	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 transact fp state */
+	mffs    fr0
+	stfd    fr0,FPSTATE_FPSCR(r7)
+
+
+	/* TM regs, incl TEXASR -- these live in thread_struct.  Note they've
+	 * been updated by the treclaim, to explain to userland the failure
+	 * cause (aborted).
+	 */
+	mfspr	r0, SPRN_TEXASR
+	mfspr	r3, SPRN_TFHAR
+	mfspr	r4, SPRN_TFIAR
+	std	r0, THREAD_TM_TEXASR(r12)
+	std	r3, THREAD_TM_TFHAR(r12)
+	std	r4, THREAD_TM_TFIAR(r12)
+
+	/* AMR is checkpointed too, but is unsupported by Linux. */
+
+	/* Restore original MSR/IRQ state & clear TM mode */
+	ld	r14, TM_FRAME_L0(r1)		/* Orig MSR */
+
+	li	r15, 0
+	rldimi  r14, r15, MSR_TS_LG, (63-MSR_TS_LG)-1
+	mtmsrd  r14
+
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, TM_FRAME_SIZE
+	lwz	r4, 8(r1)
+	ld	r0, 16(r1)
+	mtcr	r4
+	mtlr	r0
+	ld	r2, STK_GOT(r1)
+
+	/* Load CPU's default DSCR */
+	ld	r0, PACA_DSCR_DEFAULT(r13)
+	mtspr	SPRN_DSCR, r0
+
+	blr
+
+
+       /*
+	 * void __tm_recheckpoint(struct thread_struct *thread)
+	 *	- Restore the checkpointed register state saved by tm_reclaim
+	 *	  when we switch_to a process.
+	 *
+	 *	Call with IRQs off, stacks get all out of sync for
+	 *	some periods in here!
+	 */
+_GLOBAL(__tm_recheckpoint)
+	mfcr	r5
+	mflr	r0
+	stw	r5, 8(r1)
+	std	r0, 16(r1)
+	std	r2, STK_GOT(r1)
+	stdu	r1, -TM_FRAME_SIZE(r1)
+
+	/* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD].
+	 * This is used for backing up the NVGPRs:
+	 */
+	SAVE_NVGPRS(r1)
+
+	/* Load complete register state from ts_ckpt* registers */
+
+	addi	r7, r3, PT_CKPT_REGS		/* Thread's ckpt_regs */
+
+	/* Make r7 look like an exception frame so that we
+	 * can use the neat GPRx(n) macros.  r7 is now NOT a pt_regs ptr!
+	 */
+	subi	r7, r7, STACK_FRAME_OVERHEAD
+
+	/* We need to setup MSR for FP/VMX/VSX register save instructions. */
+	mfmsr	r6
+	mr	r5, r6
+	ori	r5, r5, MSR_FP
+#ifdef CONFIG_ALTIVEC
+	oris	r5, r5, MSR_VEC@h
+#endif
+#ifdef CONFIG_VSX
+	BEGIN_FTR_SECTION
+	oris	r5,r5, MSR_VSX@h
+	END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r5
+
+#ifdef CONFIG_ALTIVEC
+	/*
+	 * FP and VEC registers: These are recheckpointed from
+	 * thread.ckfp_state and thread.ckvr_state respectively. The
+	 * thread.fp_state[] version holds the 'live' (transactional)
+	 * and will be loaded subsequently by any FPUnavailable trap.
+	 */
+	addi	r8, r3, THREAD_CKVRSTATE
+	li	r5, VRSTATE_VSCR
+	lvx	v0, r8, r5
+	mtvscr	v0
+	REST_32VRS(0, r5, r8)			/* r5 scratch, r8 ptr */
+	ld	r5, THREAD_CKVRSAVE(r3)
+	mtspr	SPRN_VRSAVE, r5
+#endif
+
+	addi	r8, r3, THREAD_CKFPSTATE
+	lfd	fr0, FPSTATE_FPSCR(r8)
+	MTFSF_L(fr0)
+	REST_32FPRS_VSRS(0, R4, R8)
+
+	mtmsr	r6				/* FP/Vec off again! */
+
+restore_gprs:
+
+	/* ******************** CR,LR,CCR,MSR ********** */
+	ld	r4, _CTR(r7)
+	ld	r5, _LINK(r7)
+	ld	r8, _XER(r7)
+
+	mtctr	r4
+	mtlr	r5
+	mtxer	r8
+
+	/* ******************** TAR ******************** */
+	ld	r4, THREAD_TM_TAR(r3)
+	mtspr	SPRN_TAR,	r4
+
+	/* Load up the PPR and DSCR in GPRs only at this stage */
+	ld	r5, THREAD_TM_DSCR(r3)
+	ld	r6, THREAD_TM_PPR(r3)
+
+	REST_GPR(0, r7)				/* GPR0 */
+	REST_2GPRS(2, r7)			/* GPR2-3 */
+	REST_GPR(4, r7)				/* GPR4 */
+	REST_4GPRS(8, r7)			/* GPR8-11 */
+	REST_2GPRS(12, r7)			/* GPR12-13 */
+
+	REST_NVGPRS(r7)				/* GPR14-31 */
+
+	/* Load up PPR and DSCR here so we don't run with user values for long
+	 */
+	mtspr	SPRN_DSCR, r5
+	mtspr	SPRN_PPR, r6
+
+	/* Do final sanity check on TEXASR to make sure FS is set.  Do this
+	 * here before we load up the userspace r1 so any bugs we hit will get
+	 * a call chain */
+	mfspr	r5, SPRN_TEXASR
+	srdi	r5, r5, 16
+	li	r6, (TEXASR_FS)@h
+	and	r6, r6, r5
+1:	tdeqi	r6, 0
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+	/* Do final sanity check on MSR to make sure we are not transactional
+	 * or suspended
+	 */
+	mfmsr   r6
+	li	r5, (MSR_TS_MASK)@higher
+	srdi	r6, r6, 32
+	and	r6, r6, r5
+1:	tdnei   r6, 0
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+	/* Restore CR */
+	ld	r6, _CCR(r7)
+	mtcr    r6
+
+	REST_GPR(6, r7)
+
+	/*
+	 * Store r1 and r5 on the stack so that we can access them
+	 * after we clear MSR RI.
+	 */
+
+	REST_GPR(5, r7)
+	std	r5, -8(r1)
+	ld	r5, GPR1(r7)
+	std	r5, -16(r1)
+
+	REST_GPR(7, r7)
+
+	/* Clear MSR RI since we are about to change r1. EE is already off */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/*
+	 * BE CAREFUL HERE:
+	 * At this point we can't take an SLB miss since we have MSR_RI
+	 * off. Load only to/from the stack/paca which are in SLB bolted regions
+	 * until we turn MSR RI back on.
+	 */
+
+	SET_SCRATCH0(r1)
+	ld	r5, -8(r1)
+	ld	r1, -16(r1)
+
+	/* Commit register state as checkpointed state: */
+	TRECHKPT
+
+	HMT_MEDIUM
+
+	/* Our transactional state has now changed.
+	 *
+	 * Now just get out of here.  Transactional (current) state will be
+	 * updated once restore is called on the return path in the _switch-ed
+	 * -to process.
+	 */
+
+	GET_PACA(r13)
+	GET_SCRATCH0(r1)
+
+	/* R1 is restored, so we are recoverable again.  EE is still off */
+	li	r4, MSR_RI
+	mtmsrd	r4, 1
+
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, TM_FRAME_SIZE
+	lwz	r4, 8(r1)
+	ld	r0, 16(r1)
+	mtcr	r4
+	mtlr	r0
+	ld	r2, STK_GOT(r1)
+
+	/* Load CPU's default DSCR */
+	ld	r0, PACA_DSCR_DEFAULT(r13)
+	mtspr	SPRN_DSCR, r0
+
+	blr
+
+	/* ****************************************************************** */
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
new file mode 100644
index 000000000..d868ba420
--- /dev/null
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the powerpc trace subsystem
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR)	:= -Werror
+
+ifdef CONFIG_FUNCTION_TRACER
+# do not trace tracer code
+CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
+endif
+
+obj32-$(CONFIG_FUNCTION_TRACER)		+= ftrace_32.o
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64.o
+ifdef CONFIG_MPROFILE_KERNEL
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_mprofile.o
+else
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_pg.o
+endif
+obj-$(CONFIG_DYNAMIC_FTRACE)		+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)		+= ftrace.o
+obj-$(CONFIG_TRACING)			+= trace_clock.o
+
+obj-$(CONFIG_PPC64)			+= $(obj64-y)
+obj-$(CONFIG_PPC32)			+= $(obj32-y)
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_ftrace.o := n
+UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
new file mode 100644
index 000000000..19ef4f586
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
+ *
+ * Added function graph tracer code, taken from x86 that was written
+ * by Frederic Weisbecker, and ported to PPC by Steven Rostedt.
+ *
+ */
+
+#define pr_fmt(fmt) "ftrace-powerpc: " fmt
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/asm-prototypes.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/ftrace.h>
+#include <asm/syscall.h>
+
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static unsigned int
+ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
+{
+	unsigned int op;
+
+	addr = ppc_function_entry((void *)addr);
+
+	/* if (link) set op to 'bl' else 'b' */
+	op = create_branch((unsigned int *)ip, addr, link ? 1 : 0);
+
+	return op;
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
+{
+	unsigned int replaced;
+
+	/*
+	 * Note:
+	 * We are paranoid about modifying text, as if a bug was to happen, it
+	 * could cause us to read or write to someplace that could cause harm.
+	 * Carefully read and modify the code with probe_kernel_*(), and make
+	 * sure what we read is what we expected it to be before modifying it.
+	 */
+
+	/* read the text we want to modify */
+	if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure it is what we expect it to be */
+	if (replaced != old) {
+		pr_err("%p: replaced (%#x) != old (%#x)",
+		(void *)ip, replaced, old);
+		return -EINVAL;
+	}
+
+	/* replace the text with the new text */
+	if (patch_instruction((unsigned int *)ip, new))
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * Helper functions that are the same for both PPC64 and PPC32.
+ */
+static int test_24bit_addr(unsigned long ip, unsigned long addr)
+{
+	addr = ppc_function_entry((void *)addr);
+
+	/* use the create_branch to verify that this offset can be branched */
+	return create_branch((unsigned int *)ip, addr, 0);
+}
+
+#ifdef CONFIG_MODULES
+
+static int is_bl_op(unsigned int op)
+{
+	return (op & 0xfc000003) == 0x48000001;
+}
+
+static unsigned long find_bl_target(unsigned long ip, unsigned int op)
+{
+	static int offset;
+
+	offset = (op & 0x03fffffc);
+	/* make it signed */
+	if (offset & 0x02000000)
+		offset |= 0xfe000000;
+
+	return ip + (long)offset;
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long entry, ptr, tramp;
+	unsigned long ip = rec->ip;
+	unsigned int op, pop;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+		pr_err("Fetching opcode failed.\n");
+		return -EFAULT;
+	}
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		pr_err("Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		pr_err("Failed to get trampoline target\n");
+		return -EFAULT;
+	}
+
+	pr_devel("trampoline target %lx", ptr);
+
+	entry = ppc_global_function_entry((void *)addr);
+	/* This should match what was called */
+	if (ptr != entry) {
+		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_MPROFILE_KERNEL
+	/* When using -mkernel_profile there is no load to jump over */
+	pop = PPC_INST_NOP;
+
+	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
+		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
+		return -EFAULT;
+	}
+
+	/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
+	if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) {
+		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
+		return -EINVAL;
+	}
+#else
+	/*
+	 * Our original call site looks like:
+	 *
+	 * bl <tramp>
+	 * ld r2,XX(r1)
+	 *
+	 * Milton Miller pointed out that we can not simply nop the branch.
+	 * If a task was preempted when calling a trace function, the nops
+	 * will remove the way to restore the TOC in r2 and the r2 TOC will
+	 * get corrupted.
+	 *
+	 * Use a b +8 to jump over the load.
+	 */
+
+	pop = PPC_INST_BRANCH | 8;	/* b +8 */
+
+	/*
+	 * Check what is in the next instruction. We can see ld r2,40(r1), but
+	 * on first pass after boot we will see mflr r0.
+	 */
+	if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) {
+		pr_err("Fetching op failed.\n");
+		return -EFAULT;
+	}
+
+	if (op != PPC_INST_LD_TOC) {
+		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
+		return -EINVAL;
+	}
+#endif /* CONFIG_MPROFILE_KERNEL */
+
+	if (patch_instruction((unsigned int *)ip, pop)) {
+		pr_err("Patching NOP failed.\n");
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+#else /* !PPC64 */
+static int
+__ftrace_make_nop(struct module *mod,
+		  struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned int jmp[4];
+	unsigned long ip = rec->ip;
+	unsigned long tramp;
+
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		pr_err("Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+
+	/*
+	 * On PPC32 the trampoline looks like:
+	 *  0x3d, 0x80, 0x00, 0x00  lis r12,sym@ha
+	 *  0x39, 0x8c, 0x00, 0x00  addi r12,r12,sym@l
+	 *  0x7d, 0x89, 0x03, 0xa6  mtctr r12
+	 *  0x4e, 0x80, 0x04, 0x20  bctr
+	 */
+
+	pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+	/* Find where the trampoline jumps to */
+	if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+		pr_err("Failed to read %lx\n", tramp);
+		return -EFAULT;
+	}
+
+	pr_devel(" %08x %08x ", jmp[0], jmp[1]);
+
+	/* verify that this is what we expect it to be */
+	if (((jmp[0] & 0xffff0000) != 0x3d800000) ||
+	    ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+	    (jmp[2] != 0x7d8903a6) ||
+	    (jmp[3] != 0x4e800420)) {
+		pr_err("Not a trampoline\n");
+		return -EINVAL;
+	}
+
+	tramp = (jmp[1] & 0xffff) |
+		((jmp[0] & 0xffff) << 16);
+	if (tramp & 0x8000)
+		tramp -= 0x10000;
+
+	pr_devel(" %lx ", tramp);
+
+	if (tramp != addr) {
+		pr_err("Trampoline location %08lx does not match addr\n",
+		       tramp);
+		return -EINVAL;
+	}
+
+	op = PPC_INST_NOP;
+
+	if (patch_instruction((unsigned int *)ip, op))
+		return -EPERM;
+
+	return 0;
+}
+#endif /* PPC64 */
+#endif /* CONFIG_MODULES */
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned int old, new;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = ftrace_call_replace(ip, addr, 1);
+		new = PPC_INST_NOP;
+		return ftrace_modify_code(ip, old, new);
+	}
+
+#ifdef CONFIG_MODULES
+	/*
+	 * Out of range jumps are called from modules.
+	 * We should either already have a pointer to the module
+	 * or it has been passed in.
+	 */
+	if (!rec->arch.mod) {
+		if (!mod) {
+			pr_err("No module loaded addr=%lx\n", addr);
+			return -EFAULT;
+		}
+		rec->arch.mod = mod;
+	} else if (mod) {
+		if (mod != rec->arch.mod) {
+			pr_err("Record mod %p not equal to passed in mod %p\n",
+			       rec->arch.mod, mod);
+			return -EINVAL;
+		}
+		/* nothing to do if mod == rec->arch.mod */
+	} else
+		mod = rec->arch.mod;
+
+	return __ftrace_make_nop(mod, rec, addr);
+#else
+	/* We should not get here without modules */
+	return -EINVAL;
+#endif /* CONFIG_MODULES */
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_PPC64
+/*
+ * Examine the existing instructions for __ftrace_make_call.
+ * They should effectively be a NOP, and follow formal constraints,
+ * depending on the ABI. Return false if they don't.
+ */
+#ifndef CONFIG_MPROFILE_KERNEL
+static int
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
+{
+	/*
+	 * We expect to see:
+	 *
+	 * b +8
+	 * ld r2,XX(r1)
+	 *
+	 * The load offset is different depending on the ABI. For simplicity
+	 * just mask it out when doing the compare.
+	 */
+	if ((op0 != 0x48000008) || ((op1 & 0xffff0000) != 0xe8410000))
+		return 0;
+	return 1;
+}
+#else
+static int
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
+{
+	/* look for patched "NOP" on ppc64 with -mprofile-kernel */
+	if (op0 != PPC_INST_NOP)
+		return 0;
+	return 1;
+}
+#endif
+
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op[2];
+	void *ip = (void *)rec->ip;
+	unsigned long entry, ptr, tramp;
+	struct module *mod = rec->arch.mod;
+
+	/* read where this goes */
+	if (probe_kernel_read(op, ip, sizeof(op)))
+		return -EFAULT;
+
+	if (!expected_nop_sequence(ip, op[0], op[1])) {
+		pr_err("Unexpected call sequence at %p: %x %x\n",
+		ip, op[0], op[1]);
+		return -EINVAL;
+	}
+
+	/* If we never set up ftrace trampoline(s), then bail */
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	if (!mod->arch.tramp || !mod->arch.tramp_regs) {
+#else
+	if (!mod->arch.tramp) {
+#endif
+		pr_err("No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	if (rec->flags & FTRACE_FL_REGS)
+		tramp = mod->arch.tramp_regs;
+	else
+#endif
+		tramp = mod->arch.tramp;
+
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		pr_err("Failed to get trampoline target\n");
+		return -EFAULT;
+	}
+
+	pr_devel("trampoline target %lx", ptr);
+
+	entry = ppc_global_function_entry((void *)addr);
+	/* This should match what was called */
+	if (ptr != entry) {
+		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
+		return -EINVAL;
+	}
+
+	/* Ensure branch is within 24 bits */
+	if (!create_branch(ip, tramp, BRANCH_SET_LINK)) {
+		pr_err("Branch out of range\n");
+		return -EINVAL;
+	}
+
+	if (patch_branch(ip, tramp, BRANCH_SET_LINK)) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#else  /* !CONFIG_PPC64: */
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned int op;
+	unsigned long ip = rec->ip;
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* It should be pointing to a nop */
+	if (op != PPC_INST_NOP) {
+		pr_err("Expected NOP but have %x\n", op);
+		return -EINVAL;
+	}
+
+	/* If we never set up a trampoline to ftrace_caller, then bail */
+	if (!rec->arch.mod->arch.tramp) {
+		pr_err("No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* create the branch to the trampoline */
+	op = create_branch((unsigned int *)ip,
+			   rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+	if (!op) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	pr_devel("write to %lx\n", rec->ip);
+
+	if (patch_instruction((unsigned int *)ip, op))
+		return -EPERM;
+
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+#endif /* CONFIG_MODULES */
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned int old, new;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		old = PPC_INST_NOP;
+		new = ftrace_call_replace(ip, addr, 1);
+		return ftrace_modify_code(ip, old, new);
+	}
+
+#ifdef CONFIG_MODULES
+	/*
+	 * Out of range jumps are called from modules.
+	 * Being that we are converting from nop, it had better
+	 * already have a module defined.
+	 */
+	if (!rec->arch.mod) {
+		pr_err("No module loaded\n");
+		return -EINVAL;
+	}
+
+	return __ftrace_make_call(rec, addr);
+#else
+	/* We should not get here without modules */
+	return -EINVAL;
+#endif /* CONFIG_MODULES */
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_MODULES
+static int
+__ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+					unsigned long addr)
+{
+	unsigned int op;
+	unsigned long ip = rec->ip;
+	unsigned long entry, ptr, tramp;
+	struct module *mod = rec->arch.mod;
+
+	/* If we never set up ftrace trampolines, then bail */
+	if (!mod->arch.tramp || !mod->arch.tramp_regs) {
+		pr_err("No ftrace trampoline\n");
+		return -EINVAL;
+	}
+
+	/* read where this goes */
+	if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+		pr_err("Fetching opcode failed.\n");
+		return -EFAULT;
+	}
+
+	/* Make sure that that this is still a 24bit jump */
+	if (!is_bl_op(op)) {
+		pr_err("Not expected bl: opcode is %x\n", op);
+		return -EINVAL;
+	}
+
+	/* lets find where the pointer goes */
+	tramp = find_bl_target(ip, op);
+	entry = ppc_global_function_entry((void *)old_addr);
+
+	pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+	if (tramp != entry) {
+		/* old_addr is not within range, so we must have used a trampoline */
+		if (module_trampoline_target(mod, tramp, &ptr)) {
+			pr_err("Failed to get trampoline target\n");
+			return -EFAULT;
+		}
+
+		pr_devel("trampoline target %lx", ptr);
+
+		/* This should match what was called */
+		if (ptr != entry) {
+			pr_err("addr %lx does not match expected %lx\n", ptr, entry);
+			return -EINVAL;
+		}
+	}
+
+	/* The new target may be within range */
+	if (test_24bit_addr(ip, addr)) {
+		/* within range */
+		if (patch_branch((unsigned int *)ip, addr, BRANCH_SET_LINK)) {
+			pr_err("REL24 out of range!\n");
+			return -EINVAL;
+		}
+
+		return 0;
+	}
+
+	if (rec->flags & FTRACE_FL_REGS)
+		tramp = mod->arch.tramp_regs;
+	else
+		tramp = mod->arch.tramp;
+
+	if (module_trampoline_target(mod, tramp, &ptr)) {
+		pr_err("Failed to get trampoline target\n");
+		return -EFAULT;
+	}
+
+	pr_devel("trampoline target %lx", ptr);
+
+	entry = ppc_global_function_entry((void *)addr);
+	/* This should match what was called */
+	if (ptr != entry) {
+		pr_err("addr %lx does not match expected %lx\n", ptr, entry);
+		return -EINVAL;
+	}
+
+	/* Ensure branch is within 24 bits */
+	if (!create_branch((unsigned int *)ip, tramp, BRANCH_SET_LINK)) {
+		pr_err("Branch out of range\n");
+		return -EINVAL;
+	}
+
+	if (patch_branch((unsigned int *)ip, tramp, BRANCH_SET_LINK)) {
+		pr_err("REL24 out of range!\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+			unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned int old, new;
+
+	/*
+	 * If the calling address is more that 24 bits away,
+	 * then we had to use a trampoline to make the call.
+	 * Otherwise just update the call site.
+	 */
+	if (test_24bit_addr(ip, addr) && test_24bit_addr(ip, old_addr)) {
+		/* within range */
+		old = ftrace_call_replace(ip, old_addr, 1);
+		new = ftrace_call_replace(ip, addr, 1);
+		return ftrace_modify_code(ip, old, new);
+	}
+
+#ifdef CONFIG_MODULES
+	/*
+	 * Out of range jumps are called from modules.
+	 */
+	if (!rec->arch.mod) {
+		pr_err("No module loaded\n");
+		return -EINVAL;
+	}
+
+	return __ftrace_modify_call(rec, old_addr, addr);
+#else
+	/* We should not get here without modules */
+	return -EINVAL;
+#endif /* CONFIG_MODULES */
+}
+#endif
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long ip = (unsigned long)(&ftrace_call);
+	unsigned int old, new;
+	int ret;
+
+	old = *(unsigned int *)&ftrace_call;
+	new = ftrace_call_replace(ip, (unsigned long)func, 1);
+	ret = ftrace_modify_code(ip, old, new);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	/* Also update the regs callback function */
+	if (!ret) {
+		ip = (unsigned long)(&ftrace_regs_call);
+		old = *(unsigned int *)&ftrace_regs_call;
+		new = ftrace_call_replace(ip, (unsigned long)func, 1);
+		ret = ftrace_modify_code(ip, old, new);
+	}
+#endif
+
+	return ret;
+}
+
+/*
+ * Use the default ftrace_modify_all_code, but without
+ * stop_machine().
+ */
+void arch_ftrace_update_code(int command)
+{
+	ftrace_modify_all_code(command);
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+extern void ftrace_graph_call(void);
+extern void ftrace_graph_stub(void);
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
+	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
+	unsigned int old, new;
+
+	old = ftrace_call_replace(ip, stub, 0);
+	new = ftrace_call_replace(ip, addr, 0);
+
+	return ftrace_modify_code(ip, old, new);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	unsigned long addr = (unsigned long)(&ftrace_graph_caller);
+	unsigned long stub = (unsigned long)(&ftrace_graph_stub);
+	unsigned int old, new;
+
+	old = ftrace_call_replace(ip, addr, 0);
+	new = ftrace_call_replace(ip, stub, 0);
+
+	return ftrace_modify_code(ip, old, new);
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info. Return the address we want to divert to.
+ */
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
+{
+	unsigned long return_hooker;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		goto out;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		goto out;
+
+	return_hooker = ppc_function_entry(return_to_handler);
+
+	if (!function_graph_enter(parent, ip, 0, NULL))
+		parent = return_hooker;
+out:
+	return parent;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64)
+unsigned long __init arch_syscall_addr(int nr)
+{
+	return sys_call_table[nr*2];
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
+
+#ifdef PPC64_ELF_ABI_v1
+char *arch_ftrace_match_adjust(char *str, const char *search)
+{
+	if (str[0] == '.' && search[0] != '.')
+		return str + 1;
+	else
+		return str;
+}
+#endif /* PPC64_ELF_ABI_v1 */
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
new file mode 100644
index 000000000..2c29098f6
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -0,0 +1,98 @@
+/*
+ * Split from entry_32.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	/*
+	 * It is required that _mcount on PPC32 must preserve the
+	 * link register. But we have r0 to play with. We use r0
+	 * to push the return address back to the caller of mcount
+	 * into the ctr register, restore the link register and
+	 * then jump back using the ctr register.
+	 */
+	mflr	r0
+	mtctr	r0
+	lwz	r0, 4(r1)
+	mtlr	r0
+	bctr
+
+_GLOBAL(ftrace_caller)
+	MCOUNT_SAVE_FRAME
+	/* r3 ends up with link register */
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+
+EXPORT_SYMBOL(_mcount)
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	lwz	r4, 44(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	lwz	r3,52(r1)
+
+	bl	prepare_ftrace_return
+	nop
+
+        /*
+         * prepare_ftrace_return gives us the address we divert to.
+         * Change the LR in the callers stack frame to this.
+         */
+	stw	r3,52(r1)
+
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	stwu	r1, -32(r1)
+	stw	r3, 20(r1)
+	stw	r4, 16(r1)
+	stw	r31, 12(r1)
+	mr	r31, r1
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	lwz	r3, 20(r1)
+	lwz	r4, 16(r1)
+	lwz	r31,12(r1)
+	lwz	r1, 0(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
new file mode 100644
index 000000000..e25f77c10
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -0,0 +1,56 @@
+/*
+ * Split from entry_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+EXPORT_SYMBOL(_mcount)
+	mflr	r12
+	mtctr	r12
+	mtlr	r0
+	bctr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	std	r4,  -32(r1)
+	std	r3,  -24(r1)
+	/* save TOC */
+	std	r2,  -16(r1)
+	std	r31, -8(r1)
+	mr	r31, r1
+	stdu	r1, -112(r1)
+
+	/*
+	 * We might be called from a module.
+	 * Switch to our TOC to run inside the core kernel.
+	 */
+	ld	r2, PACATOC(r13)
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	ld	r1, 0(r1)
+	ld	r4,  -32(r1)
+	ld	r3,  -24(r1)
+	ld	r2,  -16(r1)
+	ld	r31, -8(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
new file mode 100644
index 000000000..32476a6e4
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -0,0 +1,333 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/thread_info.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+
+/*
+ *
+ * ftrace_caller()/ftrace_regs_caller() is the function that replaces _mcount()
+ * when ftrace is active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_regs_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0,LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_GPR(0, r1)
+	SAVE_10GPRS(2, r1)
+
+	/* Ok to continue? */
+	lbz	r3, PACA_FTRACE_ENABLED(r13)
+	cmpdi	r3, 0
+	beq	ftrace_no_trace
+
+	SAVE_10GPRS(12, r1)
+	SAVE_10GPRS(22, r1)
+
+	/* Save previous stack pointer (r1) */
+	addi	r8, r1, SWITCH_FRAME_SIZE
+	std	r8, GPR1(r1)
+
+	/* Load special regs for save below */
+	mfmsr   r8
+	mfctr   r9
+	mfxer   r10
+	mfcr	r11
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	/* Save it as pt_regs->nip */
+	std     r7, _NIP(r1)
+	/* Save the read LR in pt_regs->link */
+	std     r0, _LINK(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3,r2,function_trace_op@toc@ha
+	addi	r3,r3,function_trace_op@toc@l
+	ld	r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+	mr	r14,r7		/* remember old NIP */
+#endif
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi    r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Save special regs */
+	std     r8, _MSR(r1)
+	std     r9, _CTR(r1)
+	std     r10, _XER(r1)
+	std     r11, _CCR(r1)
+
+	/* Load &pt_regs in r6 for call below */
+	addi    r6, r1 ,STACK_FRAME_OVERHEAD
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_regs_call
+ftrace_regs_call:
+	bl	ftrace_stub
+	nop
+
+	/* Load ctr with the possibly modified NIP */
+	ld	r3, _NIP(r1)
+	mtctr	r3
+#ifdef CONFIG_LIVEPATCH
+	cmpd	r14, r3		/* has NIP been altered? */
+#endif
+
+	/* Restore gprs */
+	REST_GPR(0,r1)
+	REST_10GPRS(2,r1)
+	REST_10GPRS(12,r1)
+	REST_10GPRS(22,r1)
+
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+        /* Based on the cmpd above, if the NIP was altered handle livepatch */
+	bne-	livepatch_handler
+#endif
+
+ftrace_caller_common:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+	bctr			/* jump after _mcount site */
+
+_GLOBAL(ftrace_stub)
+	blr
+
+ftrace_no_trace:
+	mflr	r3
+	mtctr	r3
+	REST_GPR(3, r1)
+	addi	r1, r1, SWITCH_FRAME_SIZE
+	mtlr	r0
+	bctr
+
+_GLOBAL(ftrace_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0, LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_8GPRS(3, r1)
+
+	lbz	r3, PACA_FTRACE_ENABLED(r13)
+	cmpdi	r3, 0
+	beq	ftrace_no_trace
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	std     r7, _NIP(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3, r2, function_trace_op@toc@ha
+	addi	r3, r3, function_trace_op@toc@l
+	ld	r5, 0(r3)
+
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi    r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Set pt_regs to NULL */
+	li	r6, 0
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+
+	ld	r3, _NIP(r1)
+	mtctr	r3
+
+	/* Restore gprs */
+	REST_8GPRS(3,r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi	r1, r1, SWITCH_FRAME_SIZE
+
+	/* Reload original LR */
+	ld	r0, LRSAVE(r1)
+	mtlr	r0
+
+	/* Handle function_graph or go back */
+	b	ftrace_caller_common
+
+#ifdef CONFIG_LIVEPATCH
+	/*
+	 * This function runs in the mcount context, between two functions. As
+	 * such it can only clobber registers which are volatile and used in
+	 * function linkage.
+	 *
+	 * We get here when a function A, calls another function B, but B has
+	 * been live patched with a new function C.
+	 *
+	 * On entry:
+	 *  - we have no stack frame and can not allocate one
+	 *  - LR points back to the original caller (in A)
+	 *  - CTR holds the new NIP in C
+	 *  - r0, r11 & r12 are free
+	 */
+livepatch_handler:
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Allocate 3 x 8 bytes */
+	ld	r11, TI_livepatch_sp(r12)
+	addi	r11, r11, 24
+	std	r11, TI_livepatch_sp(r12)
+
+	/* Save toc & real LR on livepatch stack */
+	std	r2,  -24(r11)
+	mflr	r12
+	std	r12, -16(r11)
+
+	/* Store stack end marker */
+	lis     r12, STACK_END_MAGIC@h
+	ori     r12, r12, STACK_END_MAGIC@l
+	std	r12, -8(r11)
+
+	/* Put ctr in r12 for global entry and branch there */
+	mfctr	r12
+	bctrl
+
+	/*
+	 * Now we are returning from the patched function to the original
+	 * caller A. We are free to use r11, r12 and we can use r2 until we
+	 * restore it.
+	 */
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	ld	r11, TI_livepatch_sp(r12)
+
+	/* Check stack marker hasn't been trashed */
+	lis     r2,  STACK_END_MAGIC@h
+	ori     r2,  r2, STACK_END_MAGIC@l
+	ld	r12, -8(r11)
+1:	tdne	r12, r2
+	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+	/* Restore LR & toc from livepatch stack */
+	ld	r12, -16(r11)
+	mtlr	r12
+	ld	r2,  -24(r11)
+
+	/* Pop livepatch stack frame */
+	CURRENT_THREAD_INFO(r12, r1)
+	subi	r11, r11, 24
+	std	r11, TI_livepatch_sp(r12)
+
+	/* Return to original caller of live patched function */
+	blr
+#endif /* CONFIG_LIVEPATCH */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR to this.
+	 */
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
new file mode 100644
index 000000000..4c515c402
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -0,0 +1,70 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+_GLOBAL_TOC(ftrace_caller)
+	lbz	r3, PACA_FTRACE_ENABLED(r13)
+	cmpdi	r3, 0
+	beqlr
+
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	ld	r4, 128(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	ld	r11, 112(r1)
+	ld	r3, 16(r11)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	ld	r11, 112(r1)
+	std	r3, 16(r11)
+
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c
new file mode 100644
index 000000000..491706909
--- /dev/null
+++ b/arch/powerpc/kernel/trace/trace_clock.c
@@ -0,0 +1,15 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include <asm/trace_clock.h>
+#include <asm/time.h>
+
+u64 notrace trace_clock_ppc_tb(void)
+{
+	return get_tb();
+}
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
new file mode 100644
index 000000000..2379c4bf3
--- /dev/null
+++ b/arch/powerpc/kernel/traps.c
@@ -0,0 +1,2220 @@
+/*
+ *  Copyright (C) 1995-1996  Gary Thomas (gdt@linuxppc.org)
+ *  Copyright 2007-2010 Freescale Semiconductor, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ *  Modified by Cort Dougan (cort@cs.nmt.edu)
+ *  and Paul Mackerras (paulus@samba.org)
+ */
+
+/*
+ * This file handles the architecture-dependent parts of hardware exceptions
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pkeys.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/extable.h>
+#include <linux/module.h>	/* print_modules */
+#include <linux/prctl.h>
+#include <linux/delay.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/backlight.h>
+#include <linux/bug.h>
+#include <linux/kdebug.h>
+#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
+#include <linux/smp.h>
+#include <linux/console.h>
+#include <linux/kmsg_dump.h>
+
+#include <asm/emulated_ops.h>
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+#include <asm/debugfs.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/pmc.h>
+#include <asm/reg.h>
+#ifdef CONFIG_PMAC_BACKLIGHT
+#include <asm/backlight.h>
+#endif
+#ifdef CONFIG_PPC64
+#include <asm/firmware.h>
+#include <asm/processor.h>
+#include <asm/tm.h>
+#endif
+#include <asm/kexec.h>
+#include <asm/ppc-opcode.h>
+#include <asm/rio.h>
+#include <asm/fadump.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/debug.h>
+#include <asm/asm-prototypes.h>
+#include <asm/hmi.h>
+#include <sysdev/fsl_pci.h>
+#include <asm/kprobes.h>
+#include <asm/stacktrace.h>
+
+#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
+int (*__debugger)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_ipi)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_bpt)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_sstep)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_iabr_match)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_break_match)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_fault_handler)(struct pt_regs *regs) __read_mostly;
+
+EXPORT_SYMBOL(__debugger);
+EXPORT_SYMBOL(__debugger_ipi);
+EXPORT_SYMBOL(__debugger_bpt);
+EXPORT_SYMBOL(__debugger_sstep);
+EXPORT_SYMBOL(__debugger_iabr_match);
+EXPORT_SYMBOL(__debugger_break_match);
+EXPORT_SYMBOL(__debugger_fault_handler);
+#endif
+
+/* Transactional Memory trap debug */
+#ifdef TM_DEBUG_SW
+#define TM_DEBUG(x...) printk(KERN_INFO x)
+#else
+#define TM_DEBUG(x...) do { } while(0)
+#endif
+
+static const char *signame(int signr)
+{
+	switch (signr) {
+	case SIGBUS:	return "bus error";
+	case SIGFPE:	return "floating point exception";
+	case SIGILL:	return "illegal instruction";
+	case SIGSEGV:	return "segfault";
+	case SIGTRAP:	return "unhandled trap";
+	}
+
+	return "unknown signal";
+}
+
+/*
+ * Trap & Exception support
+ */
+
+#ifdef CONFIG_PMAC_BACKLIGHT
+static void pmac_backlight_unblank(void)
+{
+	mutex_lock(&pmac_backlight_mutex);
+	if (pmac_backlight) {
+		struct backlight_properties *props;
+
+		props = &pmac_backlight->props;
+		props->brightness = props->max_brightness;
+		props->power = FB_BLANK_UNBLANK;
+		backlight_update_status(pmac_backlight);
+	}
+	mutex_unlock(&pmac_backlight_mutex);
+}
+#else
+static inline void pmac_backlight_unblank(void) { }
+#endif
+
+/*
+ * If oops/die is expected to crash the machine, return true here.
+ *
+ * This should not be expected to be 100% accurate, there may be
+ * notifiers registered or other unexpected conditions that may bring
+ * down the kernel. Or if the current process in the kernel is holding
+ * locks or has other critical state, the kernel may become effectively
+ * unusable anyway.
+ */
+bool die_will_crash(void)
+{
+	if (should_fadump_crash())
+		return true;
+	if (kexec_should_crash(current))
+		return true;
+	if (in_interrupt() || panic_on_oops ||
+			!current->pid || is_global_init(current))
+		return true;
+
+	return false;
+}
+
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+static int die_counter;
+
+extern void panic_flush_kmsg_start(void)
+{
+	/*
+	 * These are mostly taken from kernel/panic.c, but tries to do
+	 * relatively minimal work. Don't use delay functions (TB may
+	 * be broken), don't crash dump (need to set a firmware log),
+	 * don't run notifiers. We do want to get some information to
+	 * Linux console.
+	 */
+	console_verbose();
+	bust_spinlocks(1);
+}
+
+extern void panic_flush_kmsg_end(void)
+{
+	printk_safe_flush_on_panic();
+	kmsg_dump(KMSG_DUMP_PANIC);
+	bust_spinlocks(0);
+	debug_locks_off();
+	console_flush_on_panic();
+}
+
+static unsigned long oops_begin(struct pt_regs *regs)
+{
+	int cpu;
+	unsigned long flags;
+
+	oops_enter();
+
+	/* racy, but better than risking deadlock. */
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	if (!arch_spin_trylock(&die_lock)) {
+		if (cpu == die_owner)
+			/* nested oops. should stop eventually */;
+		else
+			arch_spin_lock(&die_lock);
+	}
+	die_nest_count++;
+	die_owner = cpu;
+	console_verbose();
+	bust_spinlocks(1);
+	if (machine_is(powermac))
+		pmac_backlight_unblank();
+	return flags;
+}
+NOKPROBE_SYMBOL(oops_begin);
+
+static void oops_end(unsigned long flags, struct pt_regs *regs,
+			       int signr)
+{
+	bust_spinlocks(0);
+	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+	die_nest_count--;
+	oops_exit();
+	printk("\n");
+	if (!die_nest_count) {
+		/* Nest count reaches zero, release the lock. */
+		die_owner = -1;
+		arch_spin_unlock(&die_lock);
+	}
+	raw_local_irq_restore(flags);
+
+	/*
+	 * system_reset_excption handles debugger, crash dump, panic, for 0x100
+	 */
+	if (TRAP(regs) == 0x100)
+		return;
+
+	crash_fadump(regs, "die oops");
+
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+
+	if (!signr)
+		return;
+
+	/*
+	 * While our oops output is serialised by a spinlock, output
+	 * from panic() called below can race and corrupt it. If we
+	 * know we are going to panic, delay for 1 second so we have a
+	 * chance to get clean backtraces from all CPUs that are oopsing.
+	 */
+	if (in_interrupt() || panic_on_oops || !current->pid ||
+	    is_global_init(current)) {
+		mdelay(MSEC_PER_SEC);
+	}
+
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+	if (panic_on_oops)
+		panic("Fatal exception");
+	do_exit(signr);
+}
+NOKPROBE_SYMBOL(oops_end);
+
+static int __die(const char *str, struct pt_regs *regs, long err)
+{
+	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+
+	if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+		printk("LE ");
+	else
+		printk("BE ");
+
+	if (IS_ENABLED(CONFIG_PREEMPT))
+		pr_cont("PREEMPT ");
+
+	if (IS_ENABLED(CONFIG_SMP))
+		pr_cont("SMP NR_CPUS=%d ", NR_CPUS);
+
+	if (debug_pagealloc_enabled())
+		pr_cont("DEBUG_PAGEALLOC ");
+
+	if (IS_ENABLED(CONFIG_NUMA))
+		pr_cont("NUMA ");
+
+	pr_cont("%s\n", ppc_md.name ? ppc_md.name : "");
+
+	if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP)
+		return 1;
+
+	print_modules();
+	show_regs(regs);
+
+	return 0;
+}
+NOKPROBE_SYMBOL(__die);
+
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned long flags;
+
+	/*
+	 * system_reset_excption handles debugger, crash dump, panic, for 0x100
+	 */
+	if (TRAP(regs) != 0x100) {
+		if (debugger(regs))
+			return;
+	}
+
+	flags = oops_begin(regs);
+	if (__die(str, regs, err))
+		err = 0;
+	oops_end(flags, regs, err);
+}
+NOKPROBE_SYMBOL(die);
+
+void user_single_step_siginfo(struct task_struct *tsk,
+				struct pt_regs *regs, siginfo_t *info)
+{
+	info->si_signo = SIGTRAP;
+	info->si_code = TRAP_TRACE;
+	info->si_addr = (void __user *)regs->nip;
+}
+
+static void show_signal_msg(int signr, struct pt_regs *regs, int code,
+			    unsigned long addr)
+{
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if (!show_unhandled_signals)
+		return;
+
+	if (!unhandled_signal(current, signr))
+		return;
+
+	if (!__ratelimit(&rs))
+		return;
+
+	pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
+		current->comm, current->pid, signame(signr), signr,
+		addr, regs->nip, regs->link, code);
+
+	print_vma_addr(KERN_CONT " in ", regs->nip);
+
+	pr_cont("\n");
+
+	show_user_instructions(regs);
+}
+
+void _exception_pkey(int signr, struct pt_regs *regs, int code,
+		     unsigned long addr, int key)
+{
+	siginfo_t info;
+
+	if (!user_mode(regs)) {
+		die("Exception in kernel mode", regs, signr);
+		return;
+	}
+
+	show_signal_msg(signr, regs, code, addr);
+
+	if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
+	current->thread.trap_nr = code;
+
+	/*
+	 * Save all the pkey registers AMR/IAMR/UAMOR. Eg: Core dumps need
+	 * to capture the content, if the task gets killed.
+	 */
+	thread_pkey_regs_save(&current->thread);
+
+	clear_siginfo(&info);
+	info.si_signo = signr;
+	info.si_code = code;
+	info.si_addr = (void __user *) addr;
+	info.si_pkey = key;
+
+	force_sig_info(signr, &info, current);
+}
+
+void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
+{
+	_exception_pkey(signr, regs, code, addr, 0);
+}
+
+void system_reset_exception(struct pt_regs *regs)
+{
+	/*
+	 * Avoid crashes in case of nested NMI exceptions. Recoverability
+	 * is determined by RI and in_nmi
+	 */
+	bool nested = in_nmi();
+	if (!nested)
+		nmi_enter();
+
+	__this_cpu_inc(irq_stat.sreset_irqs);
+
+	/* See if any machine dependent calls */
+	if (ppc_md.system_reset_exception) {
+		if (ppc_md.system_reset_exception(regs))
+			goto out;
+	}
+
+	if (debugger(regs))
+		goto out;
+
+	kmsg_dump(KMSG_DUMP_OOPS);
+	/*
+	 * A system reset is a request to dump, so we always send
+	 * it through the crashdump code (if fadump or kdump are
+	 * registered).
+	 */
+	crash_fadump(regs, "System Reset");
+
+	crash_kexec(regs);
+
+	/*
+	 * We aren't the primary crash CPU. We need to send it
+	 * to a holding pattern to avoid it ending up in the panic
+	 * code.
+	 */
+	crash_kexec_secondary(regs);
+
+	/*
+	 * No debugger or crash dump registered, print logs then
+	 * panic.
+	 */
+	die("System Reset", regs, SIGABRT);
+
+	mdelay(2*MSEC_PER_SEC); /* Wait a little while for others to print */
+	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+	nmi_panic(regs, "System Reset");
+
+out:
+#ifdef CONFIG_PPC_BOOK3S_64
+	BUG_ON(get_paca()->in_nmi == 0);
+	if (get_paca()->in_nmi > 1)
+		die("Unrecoverable nested System Reset", regs, SIGABRT);
+#endif
+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI)) {
+		/* For the reason explained in die_mce, nmi_exit before die */
+		nmi_exit();
+		die("Unrecoverable System Reset", regs, SIGABRT);
+	}
+
+	if (!nested)
+		nmi_exit();
+
+	/* What should we do here? We could issue a shutdown or hard reset. */
+}
+
+/*
+ * I/O accesses can cause machine checks on powermacs.
+ * Check if the NIP corresponds to the address of a sync
+ * instruction for which there is an entry in the exception
+ * table.
+ * Note that the 601 only takes a machine check on TEA
+ * (transfer error ack) signal assertion, and does not
+ * set any of the top 16 bits of SRR1.
+ *  -- paulus.
+ */
+static inline int check_io_access(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC32
+	unsigned long msr = regs->msr;
+	const struct exception_table_entry *entry;
+	unsigned int *nip = (unsigned int *)regs->nip;
+
+	if (((msr & 0xffff0000) == 0 || (msr & (0x80000 | 0x40000)))
+	    && (entry = search_exception_tables(regs->nip)) != NULL) {
+		/*
+		 * Check that it's a sync instruction, or somewhere
+		 * in the twi; isync; nop sequence that inb/inw/inl uses.
+		 * As the address is in the exception table
+		 * we should be able to read the instr there.
+		 * For the debug message, we look at the preceding
+		 * load or store.
+		 */
+		if (*nip == PPC_INST_NOP)
+			nip -= 2;
+		else if (*nip == PPC_INST_ISYNC)
+			--nip;
+		if (*nip == PPC_INST_SYNC || (*nip >> 26) == OP_TRAP) {
+			unsigned int rb;
+
+			--nip;
+			rb = (*nip >> 11) & 0x1f;
+			printk(KERN_DEBUG "%s bad port %lx at %p\n",
+			       (*nip & 0x100)? "OUT to": "IN from",
+			       regs->gpr[rb] - _IO_BASE, nip);
+			regs->msr |= MSR_RI;
+			regs->nip = extable_fixup(entry);
+			return 1;
+		}
+	}
+#endif /* CONFIG_PPC32 */
+	return 0;
+}
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+/* On 4xx, the reason for the machine check or program exception
+   is in the ESR. */
+#define get_reason(regs)	((regs)->dsisr)
+#define REASON_FP		ESR_FP
+#define REASON_ILLEGAL		(ESR_PIL | ESR_PUO)
+#define REASON_PRIVILEGED	ESR_PPR
+#define REASON_TRAP		ESR_PTR
+
+/* single-step stuff */
+#define single_stepping(regs)	(current->thread.debug.dbcr0 & DBCR0_IC)
+#define clear_single_step(regs)	(current->thread.debug.dbcr0 &= ~DBCR0_IC)
+#define clear_br_trace(regs)	do {} while(0)
+#else
+/* On non-4xx, the reason for the machine check or program
+   exception is in the MSR. */
+#define get_reason(regs)	((regs)->msr)
+#define REASON_TM		SRR1_PROGTM
+#define REASON_FP		SRR1_PROGFPE
+#define REASON_ILLEGAL		SRR1_PROGILL
+#define REASON_PRIVILEGED	SRR1_PROGPRIV
+#define REASON_TRAP		SRR1_PROGTRAP
+
+#define single_stepping(regs)	((regs)->msr & MSR_SE)
+#define clear_single_step(regs)	((regs)->msr &= ~MSR_SE)
+#define clear_br_trace(regs)	((regs)->msr &= ~MSR_BE)
+#endif
+
+#if defined(CONFIG_E500)
+int machine_check_e500mc(struct pt_regs *regs)
+{
+	unsigned long mcsr = mfspr(SPRN_MCSR);
+	unsigned long pvr = mfspr(SPRN_PVR);
+	unsigned long reason = mcsr;
+	int recoverable = 1;
+
+	if (reason & MCSR_LD) {
+		recoverable = fsl_rio_mcheck_exception(regs);
+		if (recoverable == 1)
+			goto silent_out;
+	}
+
+	printk("Machine check in kernel mode.\n");
+	printk("Caused by (from MCSR=%lx): ", reason);
+
+	if (reason & MCSR_MCP)
+		printk("Machine Check Signal\n");
+
+	if (reason & MCSR_ICPERR) {
+		printk("Instruction Cache Parity Error\n");
+
+		/*
+		 * This is recoverable by invalidating the i-cache.
+		 */
+		mtspr(SPRN_L1CSR1, mfspr(SPRN_L1CSR1) | L1CSR1_ICFI);
+		while (mfspr(SPRN_L1CSR1) & L1CSR1_ICFI)
+			;
+
+		/*
+		 * This will generally be accompanied by an instruction
+		 * fetch error report -- only treat MCSR_IF as fatal
+		 * if it wasn't due to an L1 parity error.
+		 */
+		reason &= ~MCSR_IF;
+	}
+
+	if (reason & MCSR_DCPERR_MC) {
+		printk("Data Cache Parity Error\n");
+
+		/*
+		 * In write shadow mode we auto-recover from the error, but it
+		 * may still get logged and cause a machine check.  We should
+		 * only treat the non-write shadow case as non-recoverable.
+		 */
+		/* On e6500 core, L1 DCWS (Data cache write shadow mode) bit
+		 * is not implemented but L1 data cache always runs in write
+		 * shadow mode. Hence on data cache parity errors HW will
+		 * automatically invalidate the L1 Data Cache.
+		 */
+		if (PVR_VER(pvr) != PVR_VER_E6500) {
+			if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS))
+				recoverable = 0;
+		}
+	}
+
+	if (reason & MCSR_L2MMU_MHIT) {
+		printk("Hit on multiple TLB entries\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_NMI)
+		printk("Non-maskable interrupt\n");
+
+	if (reason & MCSR_IF) {
+		printk("Instruction Fetch Error Report\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_LD) {
+		printk("Load Error Report\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_ST) {
+		printk("Store Error Report\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_LDG) {
+		printk("Guarded Load Error Report\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_TLBSYNC)
+		printk("Simultaneous tlbsync operations\n");
+
+	if (reason & MCSR_BSL2_ERR) {
+		printk("Level 2 Cache Error\n");
+		recoverable = 0;
+	}
+
+	if (reason & MCSR_MAV) {
+		u64 addr;
+
+		addr = mfspr(SPRN_MCAR);
+		addr |= (u64)mfspr(SPRN_MCARU) << 32;
+
+		printk("Machine Check %s Address: %#llx\n",
+		       reason & MCSR_MEA ? "Effective" : "Physical", addr);
+	}
+
+silent_out:
+	mtspr(SPRN_MCSR, mcsr);
+	return mfspr(SPRN_MCSR) == 0 && recoverable;
+}
+
+int machine_check_e500(struct pt_regs *regs)
+{
+	unsigned long reason = mfspr(SPRN_MCSR);
+
+	if (reason & MCSR_BUS_RBERR) {
+		if (fsl_rio_mcheck_exception(regs))
+			return 1;
+		if (fsl_pci_mcheck_exception(regs))
+			return 1;
+	}
+
+	printk("Machine check in kernel mode.\n");
+	printk("Caused by (from MCSR=%lx): ", reason);
+
+	if (reason & MCSR_MCP)
+		printk("Machine Check Signal\n");
+	if (reason & MCSR_ICPERR)
+		printk("Instruction Cache Parity Error\n");
+	if (reason & MCSR_DCP_PERR)
+		printk("Data Cache Push Parity Error\n");
+	if (reason & MCSR_DCPERR)
+		printk("Data Cache Parity Error\n");
+	if (reason & MCSR_BUS_IAERR)
+		printk("Bus - Instruction Address Error\n");
+	if (reason & MCSR_BUS_RAERR)
+		printk("Bus - Read Address Error\n");
+	if (reason & MCSR_BUS_WAERR)
+		printk("Bus - Write Address Error\n");
+	if (reason & MCSR_BUS_IBERR)
+		printk("Bus - Instruction Data Error\n");
+	if (reason & MCSR_BUS_RBERR)
+		printk("Bus - Read Data Bus Error\n");
+	if (reason & MCSR_BUS_WBERR)
+		printk("Bus - Write Data Bus Error\n");
+	if (reason & MCSR_BUS_IPERR)
+		printk("Bus - Instruction Parity Error\n");
+	if (reason & MCSR_BUS_RPERR)
+		printk("Bus - Read Parity Error\n");
+
+	return 0;
+}
+
+int machine_check_generic(struct pt_regs *regs)
+{
+	return 0;
+}
+#elif defined(CONFIG_E200)
+int machine_check_e200(struct pt_regs *regs)
+{
+	unsigned long reason = mfspr(SPRN_MCSR);
+
+	printk("Machine check in kernel mode.\n");
+	printk("Caused by (from MCSR=%lx): ", reason);
+
+	if (reason & MCSR_MCP)
+		printk("Machine Check Signal\n");
+	if (reason & MCSR_CP_PERR)
+		printk("Cache Push Parity Error\n");
+	if (reason & MCSR_CPERR)
+		printk("Cache Parity Error\n");
+	if (reason & MCSR_EXCP_ERR)
+		printk("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n");
+	if (reason & MCSR_BUS_IRERR)
+		printk("Bus - Read Bus Error on instruction fetch\n");
+	if (reason & MCSR_BUS_DRERR)
+		printk("Bus - Read Bus Error on data load\n");
+	if (reason & MCSR_BUS_WRERR)
+		printk("Bus - Write Bus Error on buffered store or cache line push\n");
+
+	return 0;
+}
+#elif defined(CONFIG_PPC32)
+int machine_check_generic(struct pt_regs *regs)
+{
+	unsigned long reason = regs->msr;
+
+	printk("Machine check in kernel mode.\n");
+	printk("Caused by (from SRR1=%lx): ", reason);
+	switch (reason & 0x601F0000) {
+	case 0x80000:
+		printk("Machine check signal\n");
+		break;
+	case 0:		/* for 601 */
+	case 0x40000:
+	case 0x140000:	/* 7450 MSS error and TEA */
+		printk("Transfer error ack signal\n");
+		break;
+	case 0x20000:
+		printk("Data parity error signal\n");
+		break;
+	case 0x10000:
+		printk("Address parity error signal\n");
+		break;
+	case 0x20000000:
+		printk("L1 Data Cache error\n");
+		break;
+	case 0x40000000:
+		printk("L1 Instruction Cache error\n");
+		break;
+	case 0x00100000:
+		printk("L2 data cache parity error\n");
+		break;
+	default:
+		printk("Unknown values in msr\n");
+	}
+	return 0;
+}
+#endif /* everything else */
+
+void machine_check_exception(struct pt_regs *regs)
+{
+	int recover = 0;
+	bool nested = in_nmi();
+	if (!nested)
+		nmi_enter();
+
+	/* 64s accounts the mce in machine_check_early when in HVMODE */
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE))
+		__this_cpu_inc(irq_stat.mce_exceptions);
+
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	/* See if any machine dependent calls. In theory, we would want
+	 * to call the CPU first, and call the ppc_md. one if the CPU
+	 * one returns a positive number. However there is existing code
+	 * that assumes the board gets a first chance, so let's keep it
+	 * that way for now and fix things later. --BenH.
+	 */
+	if (ppc_md.machine_check_exception)
+		recover = ppc_md.machine_check_exception(regs);
+	else if (cur_cpu_spec->machine_check)
+		recover = cur_cpu_spec->machine_check(regs);
+
+	if (recover > 0)
+		goto bail;
+
+	if (debugger_fault_handler(regs))
+		goto bail;
+
+	if (check_io_access(regs))
+		goto bail;
+
+	if (!nested)
+		nmi_exit();
+
+	die("Machine check", regs, SIGBUS);
+
+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI))
+		die("Unrecoverable Machine check", regs, SIGBUS);
+
+	return;
+
+bail:
+	if (!nested)
+		nmi_exit();
+}
+
+void SMIException(struct pt_regs *regs)
+{
+	die("System Management Interrupt", regs, SIGABRT);
+}
+
+#ifdef CONFIG_VSX
+static void p9_hmi_special_emu(struct pt_regs *regs)
+{
+	unsigned int ra, rb, t, i, sel, instr, rc;
+	const void __user *addr;
+	u8 vbuf[16] __aligned(16), *vdst;
+	unsigned long ea, msr, msr_mask;
+	bool swap;
+
+	if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip))
+		return;
+
+	/*
+	 * lxvb16x	opcode: 0x7c0006d8
+	 * lxvd2x	opcode: 0x7c000698
+	 * lxvh8x	opcode: 0x7c000658
+	 * lxvw4x	opcode: 0x7c000618
+	 */
+	if ((instr & 0xfc00073e) != 0x7c000618) {
+		pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx"
+			 " instr=%08x\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr);
+		return;
+	}
+
+	/* Grab vector registers into the task struct */
+	msr = regs->msr; /* Grab msr before we flush the bits */
+	flush_vsx_to_thread(current);
+	enable_kernel_altivec();
+
+	/*
+	 * Is userspace running with a different endian (this is rare but
+	 * not impossible)
+	 */
+	swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
+
+	/* Decode the instruction */
+	ra = (instr >> 16) & 0x1f;
+	rb = (instr >> 11) & 0x1f;
+	t = (instr >> 21) & 0x1f;
+	if (instr & 1)
+		vdst = (u8 *)&current->thread.vr_state.vr[t];
+	else
+		vdst = (u8 *)&current->thread.fp_state.fpr[t][0];
+
+	/* Grab the vector address */
+	ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0);
+	if (is_32bit_task())
+		ea &= 0xfffffffful;
+	addr = (__force const void __user *)ea;
+
+	/* Check it */
+	if (!access_ok(VERIFY_READ, addr, 16)) {
+		pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx"
+			 " instr=%08x addr=%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, (unsigned long)addr);
+		return;
+	}
+
+	/* Read the vector */
+	rc = 0;
+	if ((unsigned long)addr & 0xfUL)
+		/* unaligned case */
+		rc = __copy_from_user_inatomic(vbuf, addr, 16);
+	else
+		__get_user_atomic_128_aligned(vbuf, addr, rc);
+	if (rc) {
+		pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx"
+			 " instr=%08x addr=%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, (unsigned long)addr);
+		return;
+	}
+
+	pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx"
+		 " instr=%08x addr=%016lx\n",
+		 smp_processor_id(), current->comm, current->pid, regs->nip,
+		 instr, (unsigned long) addr);
+
+	/* Grab instruction "selector" */
+	sel = (instr >> 6) & 3;
+
+	/*
+	 * Check to make sure the facility is actually enabled. This
+	 * could happen if we get a false positive hit.
+	 *
+	 * lxvd2x/lxvw4x always check MSR VSX sel = 0,2
+	 * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3
+	 */
+	msr_mask = MSR_VSX;
+	if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */
+		msr_mask = MSR_VEC;
+	if (!(msr & msr_mask)) {
+		pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx"
+			 " instr=%08x msr:%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, msr);
+		return;
+	}
+
+	/* Do logging here before we modify sel based on endian */
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		PPC_WARN_EMULATED(lxvw4x, regs);
+		break;
+	case 1: /* lxvh8x */
+		PPC_WARN_EMULATED(lxvh8x, regs);
+		break;
+	case 2: /* lxvd2x */
+		PPC_WARN_EMULATED(lxvd2x, regs);
+		break;
+	case 3: /* lxvb16x */
+		PPC_WARN_EMULATED(lxvb16x, regs);
+		break;
+	}
+
+#ifdef __LITTLE_ENDIAN__
+	/*
+	 * An LE kernel stores the vector in the task struct as an LE
+	 * byte array (effectively swapping both the components and
+	 * the content of the components). Those instructions expect
+	 * the components to remain in ascending address order, so we
+	 * swap them back.
+	 *
+	 * If we are running a BE user space, the expectation is that
+	 * of a simple memcpy, so forcing the emulation to look like
+	 * a lxvb16x should do the trick.
+	 */
+	if (swap)
+		sel = 3;
+
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		for (i = 0; i < 4; i++)
+			((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i];
+		break;
+	case 1: /* lxvh8x */
+		for (i = 0; i < 8; i++)
+			((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i];
+		break;
+	case 2: /* lxvd2x */
+		for (i = 0; i < 2; i++)
+			((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i];
+		break;
+	case 3: /* lxvb16x */
+		for (i = 0; i < 16; i++)
+			vdst[i] = vbuf[15-i];
+		break;
+	}
+#else /* __LITTLE_ENDIAN__ */
+	/* On a big endian kernel, a BE userspace only needs a memcpy */
+	if (!swap)
+		sel = 3;
+
+	/* Otherwise, we need to swap the content of the components */
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		for (i = 0; i < 4; i++)
+			((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]);
+		break;
+	case 1: /* lxvh8x */
+		for (i = 0; i < 8; i++)
+			((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]);
+		break;
+	case 2: /* lxvd2x */
+		for (i = 0; i < 2; i++)
+			((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]);
+		break;
+	case 3: /* lxvb16x */
+		memcpy(vdst, vbuf, 16);
+		break;
+	}
+#endif /* !__LITTLE_ENDIAN__ */
+
+	/* Go to next instruction */
+	regs->nip += 4;
+}
+#endif /* CONFIG_VSX */
+
+void handle_hmi_exception(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs;
+
+	old_regs = set_irq_regs(regs);
+	irq_enter();
+
+#ifdef CONFIG_VSX
+	/* Real mode flagged P9 special emu is needed */
+	if (local_paca->hmi_p9_special_emu) {
+		local_paca->hmi_p9_special_emu = 0;
+
+		/*
+		 * We don't want to take page faults while doing the
+		 * emulation, we just replay the instruction if necessary.
+		 */
+		pagefault_disable();
+		p9_hmi_special_emu(regs);
+		pagefault_enable();
+	}
+#endif /* CONFIG_VSX */
+
+	if (ppc_md.handle_hmi_exception)
+		ppc_md.handle_hmi_exception(regs);
+
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+
+void unknown_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
+	       regs->nip, regs->msr, regs->trap);
+
+	_exception(SIGTRAP, regs, TRAP_UNK, 0);
+
+	exception_exit(prev_state);
+}
+
+void instruction_breakpoint_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
+					5, SIGTRAP) == NOTIFY_STOP)
+		goto bail;
+	if (debugger_iabr_match(regs))
+		goto bail;
+	_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+
+bail:
+	exception_exit(prev_state);
+}
+
+void RunModeException(struct pt_regs *regs)
+{
+	_exception(SIGTRAP, regs, TRAP_UNK, 0);
+}
+
+void single_step_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	clear_single_step(regs);
+	clear_br_trace(regs);
+
+	if (kprobe_post_handler(regs))
+		return;
+
+	if (notify_die(DIE_SSTEP, "single_step", regs, 5,
+					5, SIGTRAP) == NOTIFY_STOP)
+		goto bail;
+	if (debugger_sstep(regs))
+		goto bail;
+
+	_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+
+bail:
+	exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(single_step_exception);
+
+/*
+ * After we have successfully emulated an instruction, we have to
+ * check if the instruction was being single-stepped, and if so,
+ * pretend we got a single-step exception.  This was pointed out
+ * by Kumar Gala.  -- paulus
+ */
+static void emulate_single_step(struct pt_regs *regs)
+{
+	if (single_stepping(regs))
+		single_step_exception(regs);
+}
+
+static inline int __parse_fpscr(unsigned long fpscr)
+{
+	int ret = FPE_FLTUNK;
+
+	/* Invalid operation */
+	if ((fpscr & FPSCR_VE) && (fpscr & FPSCR_VX))
+		ret = FPE_FLTINV;
+
+	/* Overflow */
+	else if ((fpscr & FPSCR_OE) && (fpscr & FPSCR_OX))
+		ret = FPE_FLTOVF;
+
+	/* Underflow */
+	else if ((fpscr & FPSCR_UE) && (fpscr & FPSCR_UX))
+		ret = FPE_FLTUND;
+
+	/* Divide by zero */
+	else if ((fpscr & FPSCR_ZE) && (fpscr & FPSCR_ZX))
+		ret = FPE_FLTDIV;
+
+	/* Inexact result */
+	else if ((fpscr & FPSCR_XE) && (fpscr & FPSCR_XX))
+		ret = FPE_FLTRES;
+
+	return ret;
+}
+
+static void parse_fpe(struct pt_regs *regs)
+{
+	int code = 0;
+
+	flush_fp_to_thread(current);
+
+	code = __parse_fpscr(current->thread.fp_state.fpscr);
+
+	_exception(SIGFPE, regs, code, regs->nip);
+}
+
+/*
+ * Illegal instruction emulation support.  Originally written to
+ * provide the PVR to user applications using the mfspr rd, PVR.
+ * Return non-zero if we can't emulate, or -EFAULT if the associated
+ * memory access caused an access fault.  Return zero on success.
+ *
+ * There are a couple of ways to do this, either "decode" the instruction
+ * or directly match lots of bits.  In this case, matching lots of
+ * bits is faster and easier.
+ *
+ */
+static int emulate_string_inst(struct pt_regs *regs, u32 instword)
+{
+	u8 rT = (instword >> 21) & 0x1f;
+	u8 rA = (instword >> 16) & 0x1f;
+	u8 NB_RB = (instword >> 11) & 0x1f;
+	u32 num_bytes;
+	unsigned long EA;
+	int pos = 0;
+
+	/* Early out if we are an invalid form of lswx */
+	if ((instword & PPC_INST_STRING_MASK) == PPC_INST_LSWX)
+		if ((rT == rA) || (rT == NB_RB))
+			return -EINVAL;
+
+	EA = (rA == 0) ? 0 : regs->gpr[rA];
+
+	switch (instword & PPC_INST_STRING_MASK) {
+		case PPC_INST_LSWX:
+		case PPC_INST_STSWX:
+			EA += NB_RB;
+			num_bytes = regs->xer & 0x7f;
+			break;
+		case PPC_INST_LSWI:
+		case PPC_INST_STSWI:
+			num_bytes = (NB_RB == 0) ? 32 : NB_RB;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	while (num_bytes != 0)
+	{
+		u8 val;
+		u32 shift = 8 * (3 - (pos & 0x3));
+
+		/* if process is 32-bit, clear upper 32 bits of EA */
+		if ((regs->msr & MSR_64BIT) == 0)
+			EA &= 0xFFFFFFFF;
+
+		switch ((instword & PPC_INST_STRING_MASK)) {
+			case PPC_INST_LSWX:
+			case PPC_INST_LSWI:
+				if (get_user(val, (u8 __user *)EA))
+					return -EFAULT;
+				/* first time updating this reg,
+				 * zero it out */
+				if (pos == 0)
+					regs->gpr[rT] = 0;
+				regs->gpr[rT] |= val << shift;
+				break;
+			case PPC_INST_STSWI:
+			case PPC_INST_STSWX:
+				val = regs->gpr[rT] >> shift;
+				if (put_user(val, (u8 __user *)EA))
+					return -EFAULT;
+				break;
+		}
+		/* move EA to next address */
+		EA += 1;
+		num_bytes--;
+
+		/* manage our position within the register */
+		if (++pos == 4) {
+			pos = 0;
+			if (++rT == 32)
+				rT = 0;
+		}
+	}
+
+	return 0;
+}
+
+static int emulate_popcntb_inst(struct pt_regs *regs, u32 instword)
+{
+	u32 ra,rs;
+	unsigned long tmp;
+
+	ra = (instword >> 16) & 0x1f;
+	rs = (instword >> 21) & 0x1f;
+
+	tmp = regs->gpr[rs];
+	tmp = tmp - ((tmp >> 1) & 0x5555555555555555ULL);
+	tmp = (tmp & 0x3333333333333333ULL) + ((tmp >> 2) & 0x3333333333333333ULL);
+	tmp = (tmp + (tmp >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+	regs->gpr[ra] = tmp;
+
+	return 0;
+}
+
+static int emulate_isel(struct pt_regs *regs, u32 instword)
+{
+	u8 rT = (instword >> 21) & 0x1f;
+	u8 rA = (instword >> 16) & 0x1f;
+	u8 rB = (instword >> 11) & 0x1f;
+	u8 BC = (instword >> 6) & 0x1f;
+	u8 bit;
+	unsigned long tmp;
+
+	tmp = (rA == 0) ? 0 : regs->gpr[rA];
+	bit = (regs->ccr >> (31 - BC)) & 0x1;
+
+	regs->gpr[rT] = bit ? tmp : regs->gpr[rB];
+
+	return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline bool tm_abort_check(struct pt_regs *regs, int cause)
+{
+        /* If we're emulating a load/store in an active transaction, we cannot
+         * emulate it as the kernel operates in transaction suspended context.
+         * We need to abort the transaction.  This creates a persistent TM
+         * abort so tell the user what caused it with a new code.
+	 */
+	if (MSR_TM_TRANSACTIONAL(regs->msr)) {
+		tm_enable();
+		tm_abort(cause);
+		return true;
+	}
+	return false;
+}
+#else
+static inline bool tm_abort_check(struct pt_regs *regs, int reason)
+{
+	return false;
+}
+#endif
+
+static int emulate_instruction(struct pt_regs *regs)
+{
+	u32 instword;
+	u32 rd;
+
+	if (!user_mode(regs))
+		return -EINVAL;
+	CHECK_FULL_REGS(regs);
+
+	if (get_user(instword, (u32 __user *)(regs->nip)))
+		return -EFAULT;
+
+	/* Emulate the mfspr rD, PVR. */
+	if ((instword & PPC_INST_MFSPR_PVR_MASK) == PPC_INST_MFSPR_PVR) {
+		PPC_WARN_EMULATED(mfpvr, regs);
+		rd = (instword >> 21) & 0x1f;
+		regs->gpr[rd] = mfspr(SPRN_PVR);
+		return 0;
+	}
+
+	/* Emulating the dcba insn is just a no-op.  */
+	if ((instword & PPC_INST_DCBA_MASK) == PPC_INST_DCBA) {
+		PPC_WARN_EMULATED(dcba, regs);
+		return 0;
+	}
+
+	/* Emulate the mcrxr insn.  */
+	if ((instword & PPC_INST_MCRXR_MASK) == PPC_INST_MCRXR) {
+		int shift = (instword >> 21) & 0x1c;
+		unsigned long msk = 0xf0000000UL >> shift;
+
+		PPC_WARN_EMULATED(mcrxr, regs);
+		regs->ccr = (regs->ccr & ~msk) | ((regs->xer >> shift) & msk);
+		regs->xer &= ~0xf0000000UL;
+		return 0;
+	}
+
+	/* Emulate load/store string insn. */
+	if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) {
+		if (tm_abort_check(regs,
+				   TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
+			return -EINVAL;
+		PPC_WARN_EMULATED(string, regs);
+		return emulate_string_inst(regs, instword);
+	}
+
+	/* Emulate the popcntb (Population Count Bytes) instruction. */
+	if ((instword & PPC_INST_POPCNTB_MASK) == PPC_INST_POPCNTB) {
+		PPC_WARN_EMULATED(popcntb, regs);
+		return emulate_popcntb_inst(regs, instword);
+	}
+
+	/* Emulate isel (Integer Select) instruction */
+	if ((instword & PPC_INST_ISEL_MASK) == PPC_INST_ISEL) {
+		PPC_WARN_EMULATED(isel, regs);
+		return emulate_isel(regs, instword);
+	}
+
+	/* Emulate sync instruction variants */
+	if ((instword & PPC_INST_SYNC_MASK) == PPC_INST_SYNC) {
+		PPC_WARN_EMULATED(sync, regs);
+		asm volatile("sync");
+		return 0;
+	}
+
+#ifdef CONFIG_PPC64
+	/* Emulate the mfspr rD, DSCR. */
+	if ((((instword & PPC_INST_MFSPR_DSCR_USER_MASK) ==
+		PPC_INST_MFSPR_DSCR_USER) ||
+	     ((instword & PPC_INST_MFSPR_DSCR_MASK) ==
+		PPC_INST_MFSPR_DSCR)) &&
+			cpu_has_feature(CPU_FTR_DSCR)) {
+		PPC_WARN_EMULATED(mfdscr, regs);
+		rd = (instword >> 21) & 0x1f;
+		regs->gpr[rd] = mfspr(SPRN_DSCR);
+		return 0;
+	}
+	/* Emulate the mtspr DSCR, rD. */
+	if ((((instword & PPC_INST_MTSPR_DSCR_USER_MASK) ==
+		PPC_INST_MTSPR_DSCR_USER) ||
+	     ((instword & PPC_INST_MTSPR_DSCR_MASK) ==
+		PPC_INST_MTSPR_DSCR)) &&
+			cpu_has_feature(CPU_FTR_DSCR)) {
+		PPC_WARN_EMULATED(mtdscr, regs);
+		rd = (instword >> 21) & 0x1f;
+		current->thread.dscr = regs->gpr[rd];
+		current->thread.dscr_inherit = 1;
+		mtspr(SPRN_DSCR, current->thread.dscr);
+		return 0;
+	}
+#endif
+
+	return -EINVAL;
+}
+
+int is_valid_bugaddr(unsigned long addr)
+{
+	return is_kernel_addr(addr);
+}
+
+#ifdef CONFIG_MATH_EMULATION
+static int emulate_math(struct pt_regs *regs)
+{
+	int ret;
+	extern int do_mathemu(struct pt_regs *regs);
+
+	ret = do_mathemu(regs);
+	if (ret >= 0)
+		PPC_WARN_EMULATED(math, regs);
+
+	switch (ret) {
+	case 0:
+		emulate_single_step(regs);
+		return 0;
+	case 1: {
+			int code = 0;
+			code = __parse_fpscr(current->thread.fp_state.fpscr);
+			_exception(SIGFPE, regs, code, regs->nip);
+			return 0;
+		}
+	case -EFAULT:
+		_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
+		return 0;
+	}
+
+	return -1;
+}
+#else
+static inline int emulate_math(struct pt_regs *regs) { return -1; }
+#endif
+
+void program_check_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+	unsigned int reason = get_reason(regs);
+
+	/* We can now get here via a FP Unavailable exception if the core
+	 * has no FPU, in that case the reason flags will be 0 */
+
+	if (reason & REASON_FP) {
+		/* IEEE FP exception */
+		parse_fpe(regs);
+		goto bail;
+	}
+	if (reason & REASON_TRAP) {
+		unsigned long bugaddr;
+		/* Debugger is first in line to stop recursive faults in
+		 * rcu_lock, notify_die, or atomic_notifier_call_chain */
+		if (debugger_bpt(regs))
+			goto bail;
+
+		if (kprobe_handler(regs))
+			goto bail;
+
+		/* trap exception */
+		if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
+				== NOTIFY_STOP)
+			goto bail;
+
+		bugaddr = regs->nip;
+		/*
+		 * Fixup bugaddr for BUG_ON() in real mode
+		 */
+		if (!is_kernel_addr(bugaddr) && !(regs->msr & MSR_IR))
+			bugaddr += PAGE_OFFSET;
+
+		if (!(regs->msr & MSR_PR) &&  /* not user-mode */
+		    report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) {
+			regs->nip += 4;
+			goto bail;
+		}
+		_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+		goto bail;
+	}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (reason & REASON_TM) {
+		/* This is a TM "Bad Thing Exception" program check.
+		 * This occurs when:
+		 * -  An rfid/hrfid/mtmsrd attempts to cause an illegal
+		 *    transition in TM states.
+		 * -  A trechkpt is attempted when transactional.
+		 * -  A treclaim is attempted when non transactional.
+		 * -  A tend is illegally attempted.
+		 * -  writing a TM SPR when transactional.
+		 *
+		 * If usermode caused this, it's done something illegal and
+		 * gets a SIGILL slap on the wrist.  We call it an illegal
+		 * operand to distinguish from the instruction just being bad
+		 * (e.g. executing a 'tend' on a CPU without TM!); it's an
+		 * illegal /placement/ of a valid instruction.
+		 */
+		if (user_mode(regs)) {
+			_exception(SIGILL, regs, ILL_ILLOPN, regs->nip);
+			goto bail;
+		} else {
+			printk(KERN_EMERG "Unexpected TM Bad Thing exception "
+			       "at %lx (msr 0x%x)\n", regs->nip, reason);
+			die("Unrecoverable exception", regs, SIGABRT);
+		}
+	}
+#endif
+
+	/*
+	 * If we took the program check in the kernel skip down to sending a
+	 * SIGILL. The subsequent cases all relate to emulating instructions
+	 * which we should only do for userspace. We also do not want to enable
+	 * interrupts for kernel faults because that might lead to further
+	 * faults, and loose the context of the original exception.
+	 */
+	if (!user_mode(regs))
+		goto sigill;
+
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
+	/* (reason & REASON_ILLEGAL) would be the obvious thing here,
+	 * but there seems to be a hardware bug on the 405GP (RevD)
+	 * that means ESR is sometimes set incorrectly - either to
+	 * ESR_DST (!?) or 0.  In the process of chasing this with the
+	 * hardware people - not sure if it can happen on any illegal
+	 * instruction or only on FP instructions, whether there is a
+	 * pattern to occurrences etc. -dgibson 31/Mar/2003
+	 */
+	if (!emulate_math(regs))
+		goto bail;
+
+	/* Try to emulate it if we should. */
+	if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) {
+		switch (emulate_instruction(regs)) {
+		case 0:
+			regs->nip += 4;
+			emulate_single_step(regs);
+			goto bail;
+		case -EFAULT:
+			_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
+			goto bail;
+		}
+	}
+
+sigill:
+	if (reason & REASON_PRIVILEGED)
+		_exception(SIGILL, regs, ILL_PRVOPC, regs->nip);
+	else
+		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+
+bail:
+	exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(program_check_exception);
+
+/*
+ * This occurs when running in hypervisor mode on POWER6 or later
+ * and an illegal instruction is encountered.
+ */
+void emulation_assist_interrupt(struct pt_regs *regs)
+{
+	regs->msr |= REASON_ILLEGAL;
+	program_check_exception(regs);
+}
+NOKPROBE_SYMBOL(emulation_assist_interrupt);
+
+void alignment_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+	int sig, code, fixed = 0;
+
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
+	if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
+		goto bail;
+
+	/* we don't implement logging of alignment exceptions */
+	if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS))
+		fixed = fix_alignment(regs);
+
+	if (fixed == 1) {
+		regs->nip += 4;	/* skip over emulated instruction */
+		emulate_single_step(regs);
+		goto bail;
+	}
+
+	/* Operand address was bad */
+	if (fixed == -EFAULT) {
+		sig = SIGSEGV;
+		code = SEGV_ACCERR;
+	} else {
+		sig = SIGBUS;
+		code = BUS_ADRALN;
+	}
+	if (user_mode(regs))
+		_exception(sig, regs, code, regs->dar);
+	else
+		bad_page_fault(regs, regs->dar, sig);
+
+bail:
+	exception_exit(prev_state);
+}
+
+void StackOverflow(struct pt_regs *regs)
+{
+	pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n",
+		current->comm, task_pid_nr(current), regs->gpr[1]);
+	debugger(regs);
+	show_regs(regs);
+	panic("kernel stack overflow");
+}
+
+void nonrecoverable_exception(struct pt_regs *regs)
+{
+	printk(KERN_ERR "Non-recoverable exception at PC=%lx MSR=%lx\n",
+	       regs->nip, regs->msr);
+	debugger(regs);
+	die("nonrecoverable exception", regs, SIGKILL);
+}
+
+void kernel_fp_unavailable_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
+			  "%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
+
+	exception_exit(prev_state);
+}
+
+void altivec_unavailable_exception(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	if (user_mode(regs)) {
+		/* A user program has executed an altivec instruction,
+		   but this kernel doesn't support altivec. */
+		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+		goto bail;
+	}
+
+	printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
+			"%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
+
+bail:
+	exception_exit(prev_state);
+}
+
+void vsx_unavailable_exception(struct pt_regs *regs)
+{
+	if (user_mode(regs)) {
+		/* A user program has executed an vsx instruction,
+		   but this kernel doesn't support vsx. */
+		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+		return;
+	}
+
+	printk(KERN_EMERG "Unrecoverable VSX Unavailable Exception "
+			"%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
+}
+
+#ifdef CONFIG_PPC64
+static void tm_unavailable(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (user_mode(regs)) {
+		current->thread.load_tm++;
+		regs->msr |= MSR_TM;
+		tm_enable();
+		tm_restore_sprs(&current->thread);
+		return;
+	}
+#endif
+	pr_emerg("Unrecoverable TM Unavailable Exception "
+			"%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable TM Unavailable Exception", regs, SIGABRT);
+}
+
+void facility_unavailable_exception(struct pt_regs *regs)
+{
+	static char *facility_strings[] = {
+		[FSCR_FP_LG] = "FPU",
+		[FSCR_VECVSX_LG] = "VMX/VSX",
+		[FSCR_DSCR_LG] = "DSCR",
+		[FSCR_PM_LG] = "PMU SPRs",
+		[FSCR_BHRB_LG] = "BHRB",
+		[FSCR_TM_LG] = "TM",
+		[FSCR_EBB_LG] = "EBB",
+		[FSCR_TAR_LG] = "TAR",
+		[FSCR_MSGP_LG] = "MSGP",
+		[FSCR_SCV_LG] = "SCV",
+	};
+	char *facility = "unknown";
+	u64 value;
+	u32 instword, rd;
+	u8 status;
+	bool hv;
+
+	hv = (TRAP(regs) == 0xf80);
+	if (hv)
+		value = mfspr(SPRN_HFSCR);
+	else
+		value = mfspr(SPRN_FSCR);
+
+	status = value >> 56;
+	if ((hv || status >= 2) &&
+	    (status < ARRAY_SIZE(facility_strings)) &&
+	    facility_strings[status])
+		facility = facility_strings[status];
+
+	/* We should not have taken this interrupt in kernel */
+	if (!user_mode(regs)) {
+		pr_emerg("Facility '%s' unavailable (%d) exception in kernel mode at %lx\n",
+			 facility, status, regs->nip);
+		die("Unexpected facility unavailable exception", regs, SIGABRT);
+	}
+
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
+	if (status == FSCR_DSCR_LG) {
+		/*
+		 * User is accessing the DSCR register using the problem
+		 * state only SPR number (0x03) either through a mfspr or
+		 * a mtspr instruction. If it is a write attempt through
+		 * a mtspr, then we set the inherit bit. This also allows
+		 * the user to write or read the register directly in the
+		 * future by setting via the FSCR DSCR bit. But in case it
+		 * is a read DSCR attempt through a mfspr instruction, we
+		 * just emulate the instruction instead. This code path will
+		 * always emulate all the mfspr instructions till the user
+		 * has attempted at least one mtspr instruction. This way it
+		 * preserves the same behaviour when the user is accessing
+		 * the DSCR through privilege level only SPR number (0x11)
+		 * which is emulated through illegal instruction exception.
+		 * We always leave HFSCR DSCR set.
+		 */
+		if (get_user(instword, (u32 __user *)(regs->nip))) {
+			pr_err("Failed to fetch the user instruction\n");
+			return;
+		}
+
+		/* Write into DSCR (mtspr 0x03, RS) */
+		if ((instword & PPC_INST_MTSPR_DSCR_USER_MASK)
+				== PPC_INST_MTSPR_DSCR_USER) {
+			rd = (instword >> 21) & 0x1f;
+			current->thread.dscr = regs->gpr[rd];
+			current->thread.dscr_inherit = 1;
+			current->thread.fscr |= FSCR_DSCR;
+			mtspr(SPRN_FSCR, current->thread.fscr);
+		}
+
+		/* Read from DSCR (mfspr RT, 0x03) */
+		if ((instword & PPC_INST_MFSPR_DSCR_USER_MASK)
+				== PPC_INST_MFSPR_DSCR_USER) {
+			if (emulate_instruction(regs)) {
+				pr_err("DSCR based mfspr emulation failed\n");
+				return;
+			}
+			regs->nip += 4;
+			emulate_single_step(regs);
+		}
+		return;
+	}
+
+	if (status == FSCR_TM_LG) {
+		/*
+		 * If we're here then the hardware is TM aware because it
+		 * generated an exception with FSRM_TM set.
+		 *
+		 * If cpu_has_feature(CPU_FTR_TM) is false, then either firmware
+		 * told us not to do TM, or the kernel is not built with TM
+		 * support.
+		 *
+		 * If both of those things are true, then userspace can spam the
+		 * console by triggering the printk() below just by continually
+		 * doing tbegin (or any TM instruction). So in that case just
+		 * send the process a SIGILL immediately.
+		 */
+		if (!cpu_has_feature(CPU_FTR_TM))
+			goto out;
+
+		tm_unavailable(regs);
+		return;
+	}
+
+	pr_err_ratelimited("%sFacility '%s' unavailable (%d), exception at 0x%lx, MSR=%lx\n",
+		hv ? "Hypervisor " : "", facility, status, regs->nip, regs->msr);
+
+out:
+	_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+}
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+
+void fp_unavailable_tm(struct pt_regs *regs)
+{
+	/* Note:  This does not handle any kind of FP laziness. */
+
+	TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
+		 regs->nip, regs->msr);
+
+        /* We can only have got here if the task started using FP after
+         * beginning the transaction.  So, the transactional regs are just a
+         * copy of the checkpointed ones.  But, we still need to recheckpoint
+         * as we're enabling FP for the process; it will return, abort the
+         * transaction, and probably retry but now with FP enabled.  So the
+         * checkpointed FP registers need to be loaded.
+	 */
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+	/* Reclaim didn't save out any FPRs to transact_fprs. */
+
+	/* Enable FP for the task: */
+	current->thread.load_fp = 1;
+
+	/* This loads and recheckpoints the FP registers from
+	 * thread.fpr[].  They will remain in registers after the
+	 * checkpoint so we don't need to reload them after.
+	 * If VMX is in use, the VRs now hold checkpointed values,
+	 * so we don't want to load the VRs from the thread_struct.
+	 */
+	tm_recheckpoint(&current->thread);
+}
+
+void altivec_unavailable_tm(struct pt_regs *regs)
+{
+	/* See the comments in fp_unavailable_tm().  This function operates
+	 * the same way.
+	 */
+
+	TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx,"
+		 "MSR=%lx\n",
+		 regs->nip, regs->msr);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+	current->thread.load_vec = 1;
+	tm_recheckpoint(&current->thread);
+	current->thread.used_vr = 1;
+}
+
+void vsx_unavailable_tm(struct pt_regs *regs)
+{
+	/* See the comments in fp_unavailable_tm().  This works similarly,
+	 * though we're loading both FP and VEC registers in here.
+	 *
+	 * If FP isn't in use, load FP regs.  If VEC isn't in use, load VEC
+	 * regs.  Either way, set MSR_VSX.
+	 */
+
+	TM_DEBUG("VSX Unavailable trap whilst transactional at 0x%lx,"
+		 "MSR=%lx\n",
+		 regs->nip, regs->msr);
+
+	current->thread.used_vsr = 1;
+
+	/* This reclaims FP and/or VR regs if they're already enabled */
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+
+	current->thread.load_vec = 1;
+	current->thread.load_fp = 1;
+
+	tm_recheckpoint(&current->thread);
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+void performance_monitor_exception(struct pt_regs *regs)
+{
+	__this_cpu_inc(irq_stat.pmu_irqs);
+
+	perf_irq(regs);
+}
+
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
+{
+	int changed = 0;
+	/*
+	 * Determine the cause of the debug event, clear the
+	 * event flags and send a trap to the handler. Torez
+	 */
+	if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
+		dbcr_dac(current) &= ~(DBCR_DAC1R | DBCR_DAC1W);
+#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
+		current->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE;
+#endif
+		do_send_trap(regs, mfspr(SPRN_DAC1), debug_status,
+			     5);
+		changed |= 0x01;
+	}  else if (debug_status & (DBSR_DAC2R | DBSR_DAC2W)) {
+		dbcr_dac(current) &= ~(DBCR_DAC2R | DBCR_DAC2W);
+		do_send_trap(regs, mfspr(SPRN_DAC2), debug_status,
+			     6);
+		changed |= 0x01;
+	}  else if (debug_status & DBSR_IAC1) {
+		current->thread.debug.dbcr0 &= ~DBCR0_IAC1;
+		dbcr_iac_range(current) &= ~DBCR_IAC12MODE;
+		do_send_trap(regs, mfspr(SPRN_IAC1), debug_status,
+			     1);
+		changed |= 0x01;
+	}  else if (debug_status & DBSR_IAC2) {
+		current->thread.debug.dbcr0 &= ~DBCR0_IAC2;
+		do_send_trap(regs, mfspr(SPRN_IAC2), debug_status,
+			     2);
+		changed |= 0x01;
+	}  else if (debug_status & DBSR_IAC3) {
+		current->thread.debug.dbcr0 &= ~DBCR0_IAC3;
+		dbcr_iac_range(current) &= ~DBCR_IAC34MODE;
+		do_send_trap(regs, mfspr(SPRN_IAC3), debug_status,
+			     3);
+		changed |= 0x01;
+	}  else if (debug_status & DBSR_IAC4) {
+		current->thread.debug.dbcr0 &= ~DBCR0_IAC4;
+		do_send_trap(regs, mfspr(SPRN_IAC4), debug_status,
+			     4);
+		changed |= 0x01;
+	}
+	/*
+	 * At the point this routine was called, the MSR(DE) was turned off.
+	 * Check all other debug flags and see if that bit needs to be turned
+	 * back on or not.
+	 */
+	if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0,
+			       current->thread.debug.dbcr1))
+		regs->msr |= MSR_DE;
+	else
+		/* Make sure the IDM flag is off */
+		current->thread.debug.dbcr0 &= ~DBCR0_IDM;
+
+	if (changed & 0x01)
+		mtspr(SPRN_DBCR0, current->thread.debug.dbcr0);
+}
+
+void DebugException(struct pt_regs *regs, unsigned long debug_status)
+{
+	current->thread.debug.dbsr = debug_status;
+
+	/* Hack alert: On BookE, Branch Taken stops on the branch itself, while
+	 * on server, it stops on the target of the branch. In order to simulate
+	 * the server behaviour, we thus restart right away with a single step
+	 * instead of stopping here when hitting a BT
+	 */
+	if (debug_status & DBSR_BT) {
+		regs->msr &= ~MSR_DE;
+
+		/* Disable BT */
+		mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_BT);
+		/* Clear the BT event */
+		mtspr(SPRN_DBSR, DBSR_BT);
+
+		/* Do the single step trick only when coming from userspace */
+		if (user_mode(regs)) {
+			current->thread.debug.dbcr0 &= ~DBCR0_BT;
+			current->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+			regs->msr |= MSR_DE;
+			return;
+		}
+
+		if (kprobe_post_handler(regs))
+			return;
+
+		if (notify_die(DIE_SSTEP, "block_step", regs, 5,
+			       5, SIGTRAP) == NOTIFY_STOP) {
+			return;
+		}
+		if (debugger_sstep(regs))
+			return;
+	} else if (debug_status & DBSR_IC) { 	/* Instruction complete */
+		regs->msr &= ~MSR_DE;
+
+		/* Disable instruction completion */
+		mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_IC);
+		/* Clear the instruction completion event */
+		mtspr(SPRN_DBSR, DBSR_IC);
+
+		if (kprobe_post_handler(regs))
+			return;
+
+		if (notify_die(DIE_SSTEP, "single_step", regs, 5,
+			       5, SIGTRAP) == NOTIFY_STOP) {
+			return;
+		}
+
+		if (debugger_sstep(regs))
+			return;
+
+		if (user_mode(regs)) {
+			current->thread.debug.dbcr0 &= ~DBCR0_IC;
+			if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0,
+					       current->thread.debug.dbcr1))
+				regs->msr |= MSR_DE;
+			else
+				/* Make sure the IDM bit is off */
+				current->thread.debug.dbcr0 &= ~DBCR0_IDM;
+		}
+
+		_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+	} else
+		handle_debug(regs, debug_status);
+}
+NOKPROBE_SYMBOL(DebugException);
+#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
+
+#if !defined(CONFIG_TAU_INT)
+void TAUException(struct pt_regs *regs)
+{
+	printk("TAU trap at PC: %lx, MSR: %lx, vector=%lx    %s\n",
+	       regs->nip, regs->msr, regs->trap, print_tainted());
+}
+#endif /* CONFIG_INT_TAU */
+
+#ifdef CONFIG_ALTIVEC
+void altivec_assist_exception(struct pt_regs *regs)
+{
+	int err;
+
+	if (!user_mode(regs)) {
+		printk(KERN_EMERG "VMX/Altivec assist exception in kernel mode"
+		       " at %lx\n", regs->nip);
+		die("Kernel VMX/Altivec assist exception", regs, SIGILL);
+	}
+
+	flush_altivec_to_thread(current);
+
+	PPC_WARN_EMULATED(altivec, regs);
+	err = emulate_altivec(regs);
+	if (err == 0) {
+		regs->nip += 4;		/* skip emulated instruction */
+		emulate_single_step(regs);
+		return;
+	}
+
+	if (err == -EFAULT) {
+		/* got an error reading the instruction */
+		_exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip);
+	} else {
+		/* didn't recognize the instruction */
+		/* XXX quick hack for now: set the non-Java bit in the VSCR */
+		printk_ratelimited(KERN_ERR "Unrecognized altivec instruction "
+				   "in %s at %lx\n", current->comm, regs->nip);
+		current->thread.vr_state.vscr.u[3] |= 0x10000;
+	}
+}
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_FSL_BOOKE
+void CacheLockingException(struct pt_regs *regs, unsigned long address,
+			   unsigned long error_code)
+{
+	/* We treat cache locking instructions from the user
+	 * as priv ops, in the future we could try to do
+	 * something smarter
+	 */
+	if (error_code & (ESR_DLK|ESR_ILK))
+		_exception(SIGILL, regs, ILL_PRVOPC, regs->nip);
+	return;
+}
+#endif /* CONFIG_FSL_BOOKE */
+
+#ifdef CONFIG_SPE
+void SPEFloatingPointException(struct pt_regs *regs)
+{
+	extern int do_spe_mathemu(struct pt_regs *regs);
+	unsigned long spefscr;
+	int fpexc_mode;
+	int code = FPE_FLTUNK;
+	int err;
+
+	flush_spe_to_thread(current);
+
+	spefscr = current->thread.spefscr;
+	fpexc_mode = current->thread.fpexc_mode;
+
+	if ((spefscr & SPEFSCR_FOVF) && (fpexc_mode & PR_FP_EXC_OVF)) {
+		code = FPE_FLTOVF;
+	}
+	else if ((spefscr & SPEFSCR_FUNF) && (fpexc_mode & PR_FP_EXC_UND)) {
+		code = FPE_FLTUND;
+	}
+	else if ((spefscr & SPEFSCR_FDBZ) && (fpexc_mode & PR_FP_EXC_DIV))
+		code = FPE_FLTDIV;
+	else if ((spefscr & SPEFSCR_FINV) && (fpexc_mode & PR_FP_EXC_INV)) {
+		code = FPE_FLTINV;
+	}
+	else if ((spefscr & (SPEFSCR_FG | SPEFSCR_FX)) && (fpexc_mode & PR_FP_EXC_RES))
+		code = FPE_FLTRES;
+
+	err = do_spe_mathemu(regs);
+	if (err == 0) {
+		regs->nip += 4;		/* skip emulated instruction */
+		emulate_single_step(regs);
+		return;
+	}
+
+	if (err == -EFAULT) {
+		/* got an error reading the instruction */
+		_exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip);
+	} else if (err == -EINVAL) {
+		/* didn't recognize the instruction */
+		printk(KERN_ERR "unrecognized spe instruction "
+		       "in %s at %lx\n", current->comm, regs->nip);
+	} else {
+		_exception(SIGFPE, regs, code, regs->nip);
+	}
+
+	return;
+}
+
+void SPEFloatingPointRoundException(struct pt_regs *regs)
+{
+	extern int speround_handler(struct pt_regs *regs);
+	int err;
+
+	preempt_disable();
+	if (regs->msr & MSR_SPE)
+		giveup_spe(current);
+	preempt_enable();
+
+	regs->nip -= 4;
+	err = speround_handler(regs);
+	if (err == 0) {
+		regs->nip += 4;		/* skip emulated instruction */
+		emulate_single_step(regs);
+		return;
+	}
+
+	if (err == -EFAULT) {
+		/* got an error reading the instruction */
+		_exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip);
+	} else if (err == -EINVAL) {
+		/* didn't recognize the instruction */
+		printk(KERN_ERR "unrecognized spe instruction "
+		       "in %s at %lx\n", current->comm, regs->nip);
+	} else {
+		_exception(SIGFPE, regs, FPE_FLTUNK, regs->nip);
+		return;
+	}
+}
+#endif
+
+/*
+ * We enter here if we get an unrecoverable exception, that is, one
+ * that happened at a point where the RI (recoverable interrupt) bit
+ * in the MSR is 0.  This indicates that SRR0/1 are live, and that
+ * we therefore lost state by taking this exception.
+ */
+void unrecoverable_exception(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n",
+	       regs->trap, regs->nip);
+	die("Unrecoverable exception", regs, SIGABRT);
+}
+NOKPROBE_SYMBOL(unrecoverable_exception);
+
+#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x)
+/*
+ * Default handler for a Watchdog exception,
+ * spins until a reboot occurs
+ */
+void __attribute__ ((weak)) WatchdogHandler(struct pt_regs *regs)
+{
+	/* Generic WatchdogHandler, implement your own */
+	mtspr(SPRN_TCR, mfspr(SPRN_TCR)&(~TCR_WIE));
+	return;
+}
+
+void WatchdogException(struct pt_regs *regs)
+{
+	printk (KERN_EMERG "PowerPC Book-E Watchdog Exception\n");
+	WatchdogHandler(regs);
+}
+#endif
+
+/*
+ * We enter here if we discover during exception entry that we are
+ * running in supervisor mode with a userspace value in the stack pointer.
+ */
+void kernel_bad_stack(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n",
+	       regs->gpr[1], regs->nip);
+	die("Bad kernel stack pointer", regs, SIGABRT);
+}
+NOKPROBE_SYMBOL(kernel_bad_stack);
+
+void __init trap_init(void)
+{
+}
+
+
+#ifdef CONFIG_PPC_EMULATED_STATS
+
+#define WARN_EMULATED_SETUP(type)	.type = { .name = #type }
+
+struct ppc_emulated ppc_emulated = {
+#ifdef CONFIG_ALTIVEC
+	WARN_EMULATED_SETUP(altivec),
+#endif
+	WARN_EMULATED_SETUP(dcba),
+	WARN_EMULATED_SETUP(dcbz),
+	WARN_EMULATED_SETUP(fp_pair),
+	WARN_EMULATED_SETUP(isel),
+	WARN_EMULATED_SETUP(mcrxr),
+	WARN_EMULATED_SETUP(mfpvr),
+	WARN_EMULATED_SETUP(multiple),
+	WARN_EMULATED_SETUP(popcntb),
+	WARN_EMULATED_SETUP(spe),
+	WARN_EMULATED_SETUP(string),
+	WARN_EMULATED_SETUP(sync),
+	WARN_EMULATED_SETUP(unaligned),
+#ifdef CONFIG_MATH_EMULATION
+	WARN_EMULATED_SETUP(math),
+#endif
+#ifdef CONFIG_VSX
+	WARN_EMULATED_SETUP(vsx),
+#endif
+#ifdef CONFIG_PPC64
+	WARN_EMULATED_SETUP(mfdscr),
+	WARN_EMULATED_SETUP(mtdscr),
+	WARN_EMULATED_SETUP(lq_stq),
+	WARN_EMULATED_SETUP(lxvw4x),
+	WARN_EMULATED_SETUP(lxvh8x),
+	WARN_EMULATED_SETUP(lxvd2x),
+	WARN_EMULATED_SETUP(lxvb16x),
+#endif
+};
+
+u32 ppc_warn_emulated;
+
+void ppc_warn_emulated_print(const char *type)
+{
+	pr_warn_ratelimited("%s used emulated %s instruction\n", current->comm,
+			    type);
+}
+
+static int __init ppc_warn_emulated_init(void)
+{
+	struct dentry *dir, *d;
+	unsigned int i;
+	struct ppc_emulated_entry *entries = (void *)&ppc_emulated;
+
+	if (!powerpc_debugfs_root)
+		return -ENODEV;
+
+	dir = debugfs_create_dir("emulated_instructions",
+				 powerpc_debugfs_root);
+	if (!dir)
+		return -ENOMEM;
+
+	d = debugfs_create_u32("do_warn", 0644, dir,
+			       &ppc_warn_emulated);
+	if (!d)
+		goto fail;
+
+	for (i = 0; i < sizeof(ppc_emulated)/sizeof(*entries); i++) {
+		d = debugfs_create_u32(entries[i].name, 0644, dir,
+				       (u32 *)&entries[i].val.counter);
+		if (!d)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
+}
+
+device_initcall(ppc_warn_emulated_init);
+
+#endif /* CONFIG_PPC_EMULATED_STATS */
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
new file mode 100644
index 000000000..7cc38b5b5
--- /dev/null
+++ b/arch/powerpc/kernel/udbg.c
@@ -0,0 +1,180 @@
+/*
+ * polling mode stateless debugging stuff, originally for NS16550 Serial Ports
+ *
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/udbg.h>
+
+void (*udbg_putc)(char c);
+void (*udbg_flush)(void);
+int (*udbg_getc)(void);
+int (*udbg_getc_poll)(void);
+
+/*
+ * Early debugging facilities. You can enable _one_ of these via .config,
+ * if you do so your kernel _will not boot_ on anything else. Be careful.
+ */
+void __init udbg_early_init(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_LPAR)
+	/* For LPAR machines that have an HVC console on vterm 0 */
+	udbg_init_debug_lpar();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI)
+	/* For LPAR machines that have an HVSI console on vterm 0 */
+	udbg_init_debug_lpar_hvsi();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_G5)
+	/* For use on Apple G5 machines */
+	udbg_init_pmac_realmode();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_RTAS_PANEL)
+	/* RTAS panel debug */
+	udbg_init_rtas_panel();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_RTAS_CONSOLE)
+	/* RTAS console debug */
+	udbg_init_rtas_console();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_MAPLE)
+	/* Maple real mode debug */
+	udbg_init_maple_realmode();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
+	udbg_init_pas_realmode();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
+	udbg_init_btext();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
+	/* PPC44x debug */
+	udbg_init_44x_as1();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_40x)
+	/* PPC40x debug */
+	udbg_init_40x_realmode();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_CPM)
+	udbg_init_cpm();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_USBGECKO)
+	udbg_init_usbgecko();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_MEMCONS)
+	/* In memory console */
+	udbg_init_memcons();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_EHV_BC)
+	udbg_init_ehv_bc();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC)
+	udbg_init_ps3gelic();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_RAW)
+	udbg_init_debug_opal_raw();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI)
+	udbg_init_debug_opal_hvsi();
+#endif
+
+#ifdef CONFIG_PPC_EARLY_DEBUG
+	console_loglevel = 10;
+
+	register_early_udbg_console();
+#endif
+}
+
+/* udbg library, used by xmon et al */
+void udbg_puts(const char *s)
+{
+	if (udbg_putc) {
+		char c;
+
+		if (s && *s != '\0') {
+			while ((c = *s++) != '\0')
+				udbg_putc(c);
+		}
+
+		if (udbg_flush)
+			udbg_flush();
+	}
+#if 0
+	else {
+		printk("%s", s);
+	}
+#endif
+}
+
+int udbg_write(const char *s, int n)
+{
+	int remain = n;
+	char c;
+
+	if (!udbg_putc)
+		return 0;
+
+	if (s && *s != '\0') {
+		while (((c = *s++) != '\0') && (remain-- > 0)) {
+			udbg_putc(c);
+		}
+	}
+
+	if (udbg_flush)
+		udbg_flush();
+
+	return n - remain;
+}
+
+#define UDBG_BUFSIZE 256
+void udbg_printf(const char *fmt, ...)
+{
+	char buf[UDBG_BUFSIZE];
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(buf, UDBG_BUFSIZE, fmt, args);
+	udbg_puts(buf);
+	va_end(args);
+}
+
+void __init udbg_progress(char *s, unsigned short hex)
+{
+	udbg_puts(s);
+	udbg_puts("\n");
+}
+
+/*
+ * Early boot console based on udbg
+ */
+static void udbg_console_write(struct console *con, const char *s,
+		unsigned int n)
+{
+	udbg_write(s, n);
+}
+
+static struct console udbg_console = {
+	.name	= "udbg",
+	.write	= udbg_console_write,
+	.flags	= CON_PRINTBUFFER | CON_ENABLED | CON_BOOT | CON_ANYTIME,
+	.index	= 0,
+};
+
+/*
+ * Called by setup_system after ppc_md->probe and ppc_md->early_init.
+ * Call it again after setting udbg_putc in ppc_md->setup_arch.
+ */
+void __init register_early_udbg_console(void)
+{
+	if (early_console)
+		return;
+
+	if (!udbg_putc)
+		return;
+
+	if (strstr(boot_command_line, "udbg-immortal")) {
+		printk(KERN_INFO "early console immortal !\n");
+		udbg_console.flags &= ~CON_BOOT;
+	}
+	early_console = &udbg_console;
+	register_console(&udbg_console);
+}
+
+#if 0   /* if you want to use this as a regular output console */
+console_initcall(register_udbg_console);
+#endif
diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c
new file mode 100644
index 000000000..411116c38
--- /dev/null
+++ b/arch/powerpc/kernel/udbg_16550.c
@@ -0,0 +1,302 @@
+/*
+ * udbg for NS16550 compatible serial ports
+ *
+ * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <asm/udbg.h>
+#include <asm/io.h>
+#include <asm/reg_a2.h>
+
+extern u8 real_readb(volatile u8 __iomem  *addr);
+extern void real_writeb(u8 data, volatile u8 __iomem *addr);
+extern u8 real_205_readb(volatile u8 __iomem  *addr);
+extern void real_205_writeb(u8 data, volatile u8 __iomem *addr);
+
+#define UART_RBR	0
+#define UART_IER	1
+#define UART_FCR	2
+#define UART_LCR	3
+#define UART_MCR	4
+#define UART_LSR	5
+#define UART_MSR	6
+#define UART_SCR	7
+#define UART_THR	UART_RBR
+#define UART_IIR	UART_FCR
+#define UART_DLL	UART_RBR
+#define UART_DLM	UART_IER
+#define UART_DLAB	UART_LCR
+
+#define LSR_DR   0x01  /* Data ready */
+#define LSR_OE   0x02  /* Overrun */
+#define LSR_PE   0x04  /* Parity error */
+#define LSR_FE   0x08  /* Framing error */
+#define LSR_BI   0x10  /* Break */
+#define LSR_THRE 0x20  /* Xmit holding register empty */
+#define LSR_TEMT 0x40  /* Xmitter empty */
+#define LSR_ERR  0x80  /* Error */
+
+#define LCR_DLAB 0x80
+
+static u8 (*udbg_uart_in)(unsigned int reg);
+static void (*udbg_uart_out)(unsigned int reg, u8 data);
+
+static void udbg_uart_flush(void)
+{
+	if (!udbg_uart_in)
+		return;
+
+	/* wait for idle */
+	while ((udbg_uart_in(UART_LSR) & LSR_THRE) == 0)
+		cpu_relax();
+}
+
+static void udbg_uart_putc(char c)
+{
+	if (!udbg_uart_out)
+		return;
+
+	if (c == '\n')
+		udbg_uart_putc('\r');
+	udbg_uart_flush();
+	udbg_uart_out(UART_THR, c);
+}
+
+static int udbg_uart_getc_poll(void)
+{
+	if (!udbg_uart_in)
+		return -1;
+
+	if (!(udbg_uart_in(UART_LSR) & LSR_DR))
+		return udbg_uart_in(UART_RBR);
+
+	return -1;
+}
+
+static int udbg_uart_getc(void)
+{
+	if (!udbg_uart_in)
+		return -1;
+	/* wait for char */
+	while (!(udbg_uart_in(UART_LSR) & LSR_DR))
+		cpu_relax();
+	return udbg_uart_in(UART_RBR);
+}
+
+static void udbg_use_uart(void)
+{
+	udbg_putc = udbg_uart_putc;
+	udbg_flush = udbg_uart_flush;
+	udbg_getc = udbg_uart_getc;
+	udbg_getc_poll = udbg_uart_getc_poll;
+}
+
+void udbg_uart_setup(unsigned int speed, unsigned int clock)
+{
+	unsigned int dll, base_bauds;
+
+	if (!udbg_uart_out)
+		return;
+
+	if (clock == 0)
+		clock = 1843200;
+	if (speed == 0)
+		speed = 9600;
+
+	base_bauds = clock / 16;
+	dll = base_bauds / speed;
+
+	udbg_uart_out(UART_LCR, 0x00);
+	udbg_uart_out(UART_IER, 0xff);
+	udbg_uart_out(UART_IER, 0x00);
+	udbg_uart_out(UART_LCR, LCR_DLAB);
+	udbg_uart_out(UART_DLL, dll & 0xff);
+	udbg_uart_out(UART_DLM, dll >> 8);
+	/* 8 data, 1 stop, no parity */
+	udbg_uart_out(UART_LCR, 0x3);
+	/* RTS/DTR */
+	udbg_uart_out(UART_MCR, 0x3);
+	/* Clear & enable FIFOs */
+	udbg_uart_out(UART_FCR, 0x7);
+}
+
+unsigned int udbg_probe_uart_speed(unsigned int clock)
+{
+	unsigned int dll, dlm, divisor, prescaler, speed;
+	u8 old_lcr;
+
+	old_lcr = udbg_uart_in(UART_LCR);
+
+	/* select divisor latch registers.  */
+	udbg_uart_out(UART_LCR, old_lcr | LCR_DLAB);
+
+	/* now, read the divisor */
+	dll = udbg_uart_in(UART_DLL);
+	dlm = udbg_uart_in(UART_DLM);
+	divisor = dlm << 8 | dll;
+
+	/* check prescaling */
+	if (udbg_uart_in(UART_MCR) & 0x80)
+		prescaler = 4;
+	else
+		prescaler = 1;
+
+	/* restore the LCR */
+	udbg_uart_out(UART_LCR, old_lcr);
+
+	/* calculate speed */
+	speed = (clock / prescaler) / (divisor * 16);
+
+	/* sanity check */
+	if (speed > (clock / 16))
+		speed = 9600;
+
+	return speed;
+}
+
+static union {
+	unsigned char __iomem *mmio_base;
+	unsigned long pio_base;
+} udbg_uart;
+
+static unsigned int udbg_uart_stride = 1;
+
+static u8 udbg_uart_in_pio(unsigned int reg)
+{
+	return inb(udbg_uart.pio_base + (reg * udbg_uart_stride));
+}
+
+static void udbg_uart_out_pio(unsigned int reg, u8 data)
+{
+	outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride));
+}
+
+void udbg_uart_init_pio(unsigned long port, unsigned int stride)
+{
+	if (!port)
+		return;
+	udbg_uart.pio_base = port;
+	udbg_uart_stride = stride;
+	udbg_uart_in = udbg_uart_in_pio;
+	udbg_uart_out = udbg_uart_out_pio;
+	udbg_use_uart();
+}
+
+static u8 udbg_uart_in_mmio(unsigned int reg)
+{
+	return in_8(udbg_uart.mmio_base + (reg * udbg_uart_stride));
+}
+
+static void udbg_uart_out_mmio(unsigned int reg, u8 data)
+{
+	out_8(udbg_uart.mmio_base + (reg * udbg_uart_stride), data);
+}
+
+
+void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride)
+{
+	if (!addr)
+		return;
+	udbg_uart.mmio_base = addr;
+	udbg_uart_stride = stride;
+	udbg_uart_in = udbg_uart_in_mmio;
+	udbg_uart_out = udbg_uart_out_mmio;
+	udbg_use_uart();
+}
+
+#ifdef CONFIG_PPC_MAPLE
+
+#define UDBG_UART_MAPLE_ADDR	((void __iomem *)0xf40003f8)
+
+static u8 udbg_uart_in_maple(unsigned int reg)
+{
+	return real_readb(UDBG_UART_MAPLE_ADDR + reg);
+}
+
+static void udbg_uart_out_maple(unsigned int reg, u8 val)
+{
+	real_writeb(val, UDBG_UART_MAPLE_ADDR + reg);
+}
+
+void __init udbg_init_maple_realmode(void)
+{
+	udbg_uart_in = udbg_uart_in_maple;
+	udbg_uart_out = udbg_uart_out_maple;
+	udbg_use_uart();
+}
+
+#endif /* CONFIG_PPC_MAPLE */
+
+#ifdef CONFIG_PPC_PASEMI
+
+#define UDBG_UART_PAS_ADDR	((void __iomem *)0xfcff03f8UL)
+
+static u8 udbg_uart_in_pas(unsigned int reg)
+{
+	return real_205_readb(UDBG_UART_PAS_ADDR + reg);
+}
+
+static void udbg_uart_out_pas(unsigned int reg, u8 val)
+{
+	real_205_writeb(val, UDBG_UART_PAS_ADDR + reg);
+}
+
+void __init udbg_init_pas_realmode(void)
+{
+	udbg_uart_in = udbg_uart_in_pas;
+	udbg_uart_out = udbg_uart_out_pas;
+	udbg_use_uart();
+}
+
+#endif /* CONFIG_PPC_PASEMI */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_44x
+
+#include <platforms/44x/44x.h>
+
+static u8 udbg_uart_in_44x_as1(unsigned int reg)
+{
+	return as1_readb((void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg);
+}
+
+static void udbg_uart_out_44x_as1(unsigned int reg, u8 val)
+{
+	as1_writeb(val, (void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg);
+}
+
+void __init udbg_init_44x_as1(void)
+{
+	udbg_uart_in = udbg_uart_in_44x_as1;
+	udbg_uart_out = udbg_uart_out_44x_as1;
+	udbg_use_uart();
+}
+
+#endif /* CONFIG_PPC_EARLY_DEBUG_44x */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_40x
+
+static u8 udbg_uart_in_40x(unsigned int reg)
+{
+	return real_readb((void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR
+			  + reg);
+}
+
+static void udbg_uart_out_40x(unsigned int reg, u8 val)
+{
+	real_writeb(val, (void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR
+		    + reg);
+}
+
+void __init udbg_init_40x_realmode(void)
+{
+	udbg_uart_in = udbg_uart_in_40x;
+	udbg_uart_out = udbg_uart_out_40x;
+	udbg_use_uart();
+}
+
+#endif /* CONFIG_PPC_EARLY_DEBUG_40x */
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
new file mode 100644
index 000000000..5d105b8ee
--- /dev/null
+++ b/arch/powerpc/kernel/uprobes.c
@@ -0,0 +1,216 @@
+/*
+ * User-space Probes (UProbes) for powerpc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007-2012
+ *
+ * Adapted from the x86 port by Ananth N Mavinakayanahalli <ananth@in.ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/sstep.h>
+
+#define UPROBE_TRAP_NR	UINT_MAX
+
+/**
+ * is_trap_insn - check if the instruction is a trap variant
+ * @insn: instruction to be checked.
+ * Returns true if @insn is a trap variant.
+ */
+bool is_trap_insn(uprobe_opcode_t *insn)
+{
+	return (is_trap(*insn));
+}
+
+/**
+ * arch_uprobe_analyze_insn
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: vaddr to probe.
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
+		struct mm_struct *mm, unsigned long addr)
+{
+	if (addr & 0x03)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct arch_uprobe_task *autask = &current->utask->autask;
+
+	autask->saved_trap_nr = current->thread.trap_nr;
+	current->thread.trap_nr = UPROBE_TRAP_NR;
+	regs->nip = current->utask->xol_vaddr;
+
+	user_enable_single_step(current);
+	return 0;
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+	return instruction_pointer(regs);
+}
+
+/*
+ * If xol insn itself traps and generates a signal (SIGILL/SIGSEGV/etc),
+ * then detect the case where a singlestepped instruction jumps back to its
+ * own address. It is assumed that anything like do_page_fault/do_trap/etc
+ * sets thread.trap_nr != UINT_MAX.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == UINT_MAX set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+	if (t->thread.trap_nr != UPROBE_TRAP_NR)
+		return true;
+
+	return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+
+	current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+	/*
+	 * On powerpc, except for loads and stores, most instructions
+	 * including ones that alter code flow (branches, calls, returns)
+	 * are emulated in the kernel. We get here only if the emulation
+	 * support doesn't exist and have to fix-up the next instruction
+	 * to be executed.
+	 */
+	regs->nip = utask->vaddr + MAX_UINSN_BYTES;
+
+	user_disable_single_step(current);
+	return 0;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	struct die_args *args = data;
+	struct pt_regs *regs = args->regs;
+
+	/* regs == NULL is a kernel bug */
+	if (WARN_ON(!regs))
+		return NOTIFY_DONE;
+
+	/* We are only interested in userspace traps */
+	if (!user_mode(regs))
+		return NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_BPT:
+		if (uprobe_pre_sstep_notifier(regs))
+			return NOTIFY_STOP;
+		break;
+	case DIE_SSTEP:
+		if (uprobe_post_sstep_notifier(regs))
+			return NOTIFY_STOP;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal, so reset the instruction pointer to its
+ * probed address.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	current->thread.trap_nr = utask->autask.saved_trap_nr;
+	instruction_pointer_set(regs, utask->vaddr);
+
+	user_disable_single_step(current);
+}
+
+/*
+ * See if the instruction can be emulated.
+ * Returns true if instruction was emulated, false otherwise.
+ */
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	int ret;
+
+	/*
+	 * emulate_step() returns 1 if the insn was successfully emulated.
+	 * For all other cases, we need to single-step in hardware.
+	 */
+	ret = emulate_step(regs, auprobe->insn);
+	if (ret > 0)
+		return true;
+
+	return false;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+	unsigned long orig_ret_vaddr;
+
+	orig_ret_vaddr = regs->link;
+
+	/* Replace the return addr with trampoline addr */
+	regs->link = trampoline_vaddr;
+
+	return orig_ret_vaddr;
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+				struct pt_regs *regs)
+{
+	if (ctx == RP_CHECK_CHAIN_CALL)
+		return regs->gpr[1] <= ret->stack;
+	else
+		return regs->gpr[1] < ret->stack;
+}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
new file mode 100644
index 000000000..31ab6eb61
--- /dev/null
+++ b/arch/powerpc/kernel/vdso.c
@@ -0,0 +1,827 @@
+
+/*
+ *    Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/memblock.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/firmware.h>
+#include <asm/vdso.h>
+#include <asm/vdso_datapage.h>
+#include <asm/setup.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/* Max supported size for symbol names */
+#define MAX_SYMNAME	64
+
+/* The alignment of the vDSO */
+#define VDSO_ALIGNMENT	(1 << 16)
+
+static unsigned int vdso32_pages;
+static void *vdso32_kbase;
+static struct page **vdso32_pagelist;
+unsigned long vdso32_sigtramp;
+unsigned long vdso32_rt_sigtramp;
+
+#ifdef CONFIG_VDSO32
+extern char vdso32_start, vdso32_end;
+#endif
+
+#ifdef CONFIG_PPC64
+extern char vdso64_start, vdso64_end;
+static void *vdso64_kbase = &vdso64_start;
+static unsigned int vdso64_pages;
+static struct page **vdso64_pagelist;
+unsigned long vdso64_rt_sigtramp;
+#endif /* CONFIG_PPC64 */
+
+static int vdso_ready;
+
+/*
+ * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
+ * Once the early boot kernel code no longer needs to muck around
+ * with it, it will become dynamically allocated
+ */
+static union {
+	struct vdso_data	data;
+	u8			page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
+struct vdso_data *vdso_data = &vdso_data_store.data;
+
+/* Format of the patch table */
+struct vdso_patch_def
+{
+	unsigned long	ftr_mask, ftr_value;
+	const char	*gen_name;
+	const char	*fix_name;
+};
+
+/* Table of functions to patch based on the CPU type/revision
+ *
+ * Currently, we only change sync_dicache to do nothing on processors
+ * with a coherent icache
+ */
+static struct vdso_patch_def vdso_patches[] = {
+	{
+		CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
+		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
+	},
+#ifdef CONFIG_PPC32
+	{
+		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
+		"__kernel_gettimeofday", NULL
+	},
+	{
+		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
+		"__kernel_clock_gettime", NULL
+	},
+	{
+		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
+		"__kernel_clock_getres", NULL
+	},
+	{
+		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
+		"__kernel_get_tbfreq", NULL
+	},
+	{
+		CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
+		"__kernel_time", NULL
+	},
+#endif
+};
+
+/*
+ * Some infos carried around for each of them during parsing at
+ * boot time.
+ */
+struct lib32_elfinfo
+{
+	Elf32_Ehdr	*hdr;		/* ptr to ELF */
+	Elf32_Sym	*dynsym;	/* ptr to .dynsym section */
+	unsigned long	dynsymsize;	/* size of .dynsym section */
+	char		*dynstr;	/* ptr to .dynstr section */
+	unsigned long	text;		/* offset of .text section in .so */
+};
+
+struct lib64_elfinfo
+{
+	Elf64_Ehdr	*hdr;
+	Elf64_Sym	*dynsym;
+	unsigned long	dynsymsize;
+	char		*dynstr;
+	unsigned long	text;
+};
+
+
+/*
+ * This is called from binfmt_elf, we create the special vma for the
+ * vDSO and insert it into the mm struct tree
+ */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	struct mm_struct *mm = current->mm;
+	struct page **vdso_pagelist;
+	unsigned long vdso_pages;
+	unsigned long vdso_base;
+	int rc;
+
+	if (!vdso_ready)
+		return 0;
+
+#ifdef CONFIG_PPC64
+	if (is_32bit_task()) {
+		vdso_pagelist = vdso32_pagelist;
+		vdso_pages = vdso32_pages;
+		vdso_base = VDSO32_MBASE;
+	} else {
+		vdso_pagelist = vdso64_pagelist;
+		vdso_pages = vdso64_pages;
+		/*
+		 * On 64bit we don't have a preferred map address. This
+		 * allows get_unmapped_area to find an area near other mmaps
+		 * and most likely share a SLB entry.
+		 */
+		vdso_base = 0;
+	}
+#else
+	vdso_pagelist = vdso32_pagelist;
+	vdso_pages = vdso32_pages;
+	vdso_base = VDSO32_MBASE;
+#endif
+
+	current->mm->context.vdso_base = 0;
+
+	/* vDSO has a problem and was disabled, just don't "enable" it for the
+	 * process
+	 */
+	if (vdso_pages == 0)
+		return 0;
+	/* Add a page to the vdso size for the data page */
+	vdso_pages ++;
+
+	/*
+	 * pick a base address for the vDSO in process space. We try to put it
+	 * at vdso_base which is the "natural" base for it, but we might fail
+	 * and end up putting it elsewhere.
+	 * Add enough to the size so that the result can be aligned.
+	 */
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+	vdso_base = get_unmapped_area(NULL, vdso_base,
+				      (vdso_pages << PAGE_SHIFT) +
+				      ((VDSO_ALIGNMENT - 1) & PAGE_MASK),
+				      0, 0);
+	if (IS_ERR_VALUE(vdso_base)) {
+		rc = vdso_base;
+		goto fail_mmapsem;
+	}
+
+	/* Add required alignment. */
+	vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);
+
+	/*
+	 * Put vDSO base into mm struct. We need to do this before calling
+	 * install_special_mapping or the perf counter mmap tracking code
+	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
+	 */
+	current->mm->context.vdso_base = vdso_base;
+
+	/*
+	 * our vma flags don't have VM_WRITE so by default, the process isn't
+	 * allowed to write those pages.
+	 * gdb can break that with ptrace interface, and thus trigger COW on
+	 * those pages but it's then your responsibility to never do that on
+	 * the "data" page of the vDSO or you'll stop getting kernel updates
+	 * and your nice userland gettimeofday will be totally dead.
+	 * It's fine to use that for setting breakpoints in the vDSO code
+	 * pages though.
+	 */
+	rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+				     VM_READ|VM_EXEC|
+				     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+				     vdso_pagelist);
+	if (rc) {
+		current->mm->context.vdso_base = 0;
+		goto fail_mmapsem;
+	}
+
+	up_write(&mm->mmap_sem);
+	return 0;
+
+ fail_mmapsem:
+	up_write(&mm->mmap_sem);
+	return rc;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso_base)
+		return "[vdso]";
+	return NULL;
+}
+
+
+
+#ifdef CONFIG_VDSO32
+static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
+				  unsigned long *size)
+{
+	Elf32_Shdr *sechdrs;
+	unsigned int i;
+	char *secnames;
+
+	/* Grab section headers and strings so we can tell who is who */
+	sechdrs = (void *)ehdr + ehdr->e_shoff;
+	secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+	/* Find the section they want */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+			if (size)
+				*size = sechdrs[i].sh_size;
+			return (void *)ehdr + sechdrs[i].sh_offset;
+		}
+	}
+	*size = 0;
+	return NULL;
+}
+
+static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib,
+					const char *symname)
+{
+	unsigned int i;
+	char name[MAX_SYMNAME], *c;
+
+	for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) {
+		if (lib->dynsym[i].st_name == 0)
+			continue;
+		strlcpy(name, lib->dynstr + lib->dynsym[i].st_name,
+			MAX_SYMNAME);
+		c = strchr(name, '@');
+		if (c)
+			*c = 0;
+		if (strcmp(symname, name) == 0)
+			return &lib->dynsym[i];
+	}
+	return NULL;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib,
+					    const char *symname)
+{
+	Elf32_Sym *sym = find_symbol32(lib, symname);
+
+	if (sym == NULL) {
+		printk(KERN_WARNING "vDSO32: function %s not found !\n",
+		       symname);
+		return 0;
+	}
+	return sym->st_value - VDSO32_LBASE;
+}
+
+static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64,
+				       const char *orig, const char *fix)
+{
+	Elf32_Sym *sym32_gen, *sym32_fix;
+
+	sym32_gen = find_symbol32(v32, orig);
+	if (sym32_gen == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", orig);
+		return -1;
+	}
+	if (fix == NULL) {
+		sym32_gen->st_name = 0;
+		return 0;
+	}
+	sym32_fix = find_symbol32(v32, fix);
+	if (sym32_fix == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", fix);
+		return -1;
+	}
+	sym32_gen->st_value = sym32_fix->st_value;
+	sym32_gen->st_size = sym32_fix->st_size;
+	sym32_gen->st_info = sym32_fix->st_info;
+	sym32_gen->st_other = sym32_fix->st_other;
+	sym32_gen->st_shndx = sym32_fix->st_shndx;
+
+	return 0;
+}
+#else /* !CONFIG_VDSO32 */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib,
+					    const char *symname)
+{
+	return 0;
+}
+
+static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64,
+				       const char *orig, const char *fix)
+{
+	return 0;
+}
+#endif /* CONFIG_VDSO32 */
+
+
+#ifdef CONFIG_PPC64
+
+static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname,
+				  unsigned long *size)
+{
+	Elf64_Shdr *sechdrs;
+	unsigned int i;
+	char *secnames;
+
+	/* Grab section headers and strings so we can tell who is who */
+	sechdrs = (void *)ehdr + ehdr->e_shoff;
+	secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+	/* Find the section they want */
+	for (i = 1; i < ehdr->e_shnum; i++) {
+		if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) {
+			if (size)
+				*size = sechdrs[i].sh_size;
+			return (void *)ehdr + sechdrs[i].sh_offset;
+		}
+	}
+	if (size)
+		*size = 0;
+	return NULL;
+}
+
+static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib,
+					const char *symname)
+{
+	unsigned int i;
+	char name[MAX_SYMNAME], *c;
+
+	for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) {
+		if (lib->dynsym[i].st_name == 0)
+			continue;
+		strlcpy(name, lib->dynstr + lib->dynsym[i].st_name,
+			MAX_SYMNAME);
+		c = strchr(name, '@');
+		if (c)
+			*c = 0;
+		if (strcmp(symname, name) == 0)
+			return &lib->dynsym[i];
+	}
+	return NULL;
+}
+
+/* Note that we assume the section is .text and the symbol is relative to
+ * the library base
+ */
+static unsigned long __init find_function64(struct lib64_elfinfo *lib,
+					    const char *symname)
+{
+	Elf64_Sym *sym = find_symbol64(lib, symname);
+
+	if (sym == NULL) {
+		printk(KERN_WARNING "vDSO64: function %s not found !\n",
+		       symname);
+		return 0;
+	}
+#ifdef VDS64_HAS_DESCRIPTORS
+	return *((u64 *)(vdso64_kbase + sym->st_value - VDSO64_LBASE)) -
+		VDSO64_LBASE;
+#else
+	return sym->st_value - VDSO64_LBASE;
+#endif
+}
+
+static int __init vdso_do_func_patch64(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64,
+				       const char *orig, const char *fix)
+{
+	Elf64_Sym *sym64_gen, *sym64_fix;
+
+	sym64_gen = find_symbol64(v64, orig);
+	if (sym64_gen == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", orig);
+		return -1;
+	}
+	if (fix == NULL) {
+		sym64_gen->st_name = 0;
+		return 0;
+	}
+	sym64_fix = find_symbol64(v64, fix);
+	if (sym64_fix == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", fix);
+		return -1;
+	}
+	sym64_gen->st_value = sym64_fix->st_value;
+	sym64_gen->st_size = sym64_fix->st_size;
+	sym64_gen->st_info = sym64_fix->st_info;
+	sym64_gen->st_other = sym64_fix->st_other;
+	sym64_gen->st_shndx = sym64_fix->st_shndx;
+
+	return 0;
+}
+
+#endif /* CONFIG_PPC64 */
+
+
+static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
+					struct lib64_elfinfo *v64)
+{
+	void *sect;
+
+	/*
+	 * Locate symbol tables & text section
+	 */
+
+#ifdef CONFIG_VDSO32
+	v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize);
+	v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL);
+	if (v32->dynsym == NULL || v32->dynstr == NULL) {
+		printk(KERN_ERR "vDSO32: required symbol section not found\n");
+		return -1;
+	}
+	sect = find_section32(v32->hdr, ".text", NULL);
+	if (sect == NULL) {
+		printk(KERN_ERR "vDSO32: the .text section was not found\n");
+		return -1;
+	}
+	v32->text = sect - vdso32_kbase;
+#endif
+
+#ifdef CONFIG_PPC64
+	v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize);
+	v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL);
+	if (v64->dynsym == NULL || v64->dynstr == NULL) {
+		printk(KERN_ERR "vDSO64: required symbol section not found\n");
+		return -1;
+	}
+	sect = find_section64(v64->hdr, ".text", NULL);
+	if (sect == NULL) {
+		printk(KERN_ERR "vDSO64: the .text section was not found\n");
+		return -1;
+	}
+	v64->text = sect - vdso64_kbase;
+#endif /* CONFIG_PPC64 */
+
+	return 0;
+}
+
+static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
+					  struct lib64_elfinfo *v64)
+{
+	/*
+	 * Find signal trampolines
+	 */
+
+#ifdef CONFIG_PPC64
+	vdso64_rt_sigtramp = find_function64(v64, "__kernel_sigtramp_rt64");
+#endif
+	vdso32_sigtramp	   = find_function32(v32, "__kernel_sigtramp32");
+	vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32");
+}
+
+static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64)
+{
+#ifdef CONFIG_VDSO32
+	Elf32_Sym *sym32;
+#endif
+#ifdef CONFIG_PPC64
+	Elf64_Sym *sym64;
+
+       	sym64 = find_symbol64(v64, "__kernel_datapage_offset");
+	if (sym64 == NULL) {
+		printk(KERN_ERR "vDSO64: Can't find symbol "
+		       "__kernel_datapage_offset !\n");
+		return -1;
+	}
+	*((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
+		(vdso64_pages << PAGE_SHIFT) -
+		(sym64->st_value - VDSO64_LBASE);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_VDSO32
+	sym32 = find_symbol32(v32, "__kernel_datapage_offset");
+	if (sym32 == NULL) {
+		printk(KERN_ERR "vDSO32: Can't find symbol "
+		       "__kernel_datapage_offset !\n");
+		return -1;
+	}
+	*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
+		(vdso32_pages << PAGE_SHIFT) -
+		(sym32->st_value - VDSO32_LBASE);
+#endif
+
+	return 0;
+}
+
+
+static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
+				      struct lib64_elfinfo *v64)
+{
+	unsigned long size;
+	void *start;
+
+#ifdef CONFIG_PPC64
+	start = find_section64(v64->hdr, "__ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(cur_cpu_spec->cpu_features,
+				  start, start + size);
+
+	start = find_section64(v64->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  start, start + size);
+
+	start = find_section64(v64->hdr, "__fw_ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(powerpc_firmware_features,
+				  start, start + size);
+
+	start = find_section64(v64->hdr, "__lwsync_fixup", &size);
+	if (start)
+		do_lwsync_fixups(cur_cpu_spec->cpu_features,
+				 start, start + size);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_VDSO32
+	start = find_section32(v32->hdr, "__ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(cur_cpu_spec->cpu_features,
+				  start, start + size);
+
+	start = find_section32(v32->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(cur_cpu_spec->mmu_features,
+				  start, start + size);
+
+#ifdef CONFIG_PPC64
+	start = find_section32(v32->hdr, "__fw_ftr_fixup", &size);
+	if (start)
+		do_feature_fixups(powerpc_firmware_features,
+				  start, start + size);
+#endif /* CONFIG_PPC64 */
+
+	start = find_section32(v32->hdr, "__lwsync_fixup", &size);
+	if (start)
+		do_lwsync_fixups(cur_cpu_spec->cpu_features,
+				 start, start + size);
+#endif
+
+	return 0;
+}
+
+static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vdso_patches); i++) {
+		struct vdso_patch_def *patch = &vdso_patches[i];
+		int match = (cur_cpu_spec->cpu_features & patch->ftr_mask)
+			== patch->ftr_value;
+		if (!match)
+			continue;
+
+		DBG("replacing %s with %s...\n", patch->gen_name,
+		    patch->fix_name ? "NONE" : patch->fix_name);
+
+		/*
+		 * Patch the 32 bits and 64 bits symbols. Note that we do not
+		 * patch the "." symbol on 64 bits.
+		 * It would be easy to do, but doesn't seem to be necessary,
+		 * patching the OPD symbol is enough.
+		 */
+		vdso_do_func_patch32(v32, v64, patch->gen_name,
+				     patch->fix_name);
+#ifdef CONFIG_PPC64
+		vdso_do_func_patch64(v32, v64, patch->gen_name,
+				     patch->fix_name);
+#endif /* CONFIG_PPC64 */
+	}
+
+	return 0;
+}
+
+
+static __init int vdso_setup(void)
+{
+	struct lib32_elfinfo	v32;
+	struct lib64_elfinfo	v64;
+
+	v32.hdr = vdso32_kbase;
+#ifdef CONFIG_PPC64
+	v64.hdr = vdso64_kbase;
+#endif
+	if (vdso_do_find_sections(&v32, &v64))
+		return -1;
+
+	if (vdso_fixup_datapage(&v32, &v64))
+		return -1;
+
+	if (vdso_fixup_features(&v32, &v64))
+		return -1;
+
+	if (vdso_fixup_alt_funcs(&v32, &v64))
+		return -1;
+
+	vdso_setup_trampolines(&v32, &v64);
+
+	return 0;
+}
+
+/*
+ * Called from setup_arch to initialize the bitmap of available
+ * syscalls in the systemcfg page
+ */
+static void __init vdso_setup_syscall_map(void)
+{
+	unsigned int i;
+	extern unsigned long *sys_call_table;
+	extern unsigned long sys_ni_syscall;
+
+
+	for (i = 0; i < NR_syscalls; i++) {
+#ifdef CONFIG_PPC64
+		if (sys_call_table[i*2] != sys_ni_syscall)
+			vdso_data->syscall_map_64[i >> 5] |=
+				0x80000000UL >> (i & 0x1f);
+		if (sys_call_table[i*2+1] != sys_ni_syscall)
+			vdso_data->syscall_map_32[i >> 5] |=
+				0x80000000UL >> (i & 0x1f);
+#else /* CONFIG_PPC64 */
+		if (sys_call_table[i] != sys_ni_syscall)
+			vdso_data->syscall_map_32[i >> 5] |=
+				0x80000000UL >> (i & 0x1f);
+#endif /* CONFIG_PPC64 */
+	}
+}
+
+#ifdef CONFIG_PPC64
+int vdso_getcpu_init(void)
+{
+	unsigned long cpu, node, val;
+
+	/*
+	 * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node
+	 * in the next 16 bits.  The VDSO uses this to implement getcpu().
+	 */
+	cpu = get_cpu();
+	WARN_ON_ONCE(cpu > 0xffff);
+
+	node = cpu_to_node(cpu);
+	WARN_ON_ONCE(node > 0xffff);
+
+	val = (cpu & 0xffff) | ((node & 0xffff) << 16);
+	mtspr(SPRN_SPRG_VDSO_WRITE, val);
+	get_paca()->sprg_vdso = val;
+
+	put_cpu();
+
+	return 0;
+}
+/* We need to call this before SMP init */
+early_initcall(vdso_getcpu_init);
+#endif
+
+static int __init vdso_init(void)
+{
+	int i;
+
+#ifdef CONFIG_PPC64
+	/*
+	 * Fill up the "systemcfg" stuff for backward compatibility
+	 */
+	strcpy((char *)vdso_data->eye_catcher, "SYSTEMCFG:PPC64");
+	vdso_data->version.major = SYSTEMCFG_MAJOR;
+	vdso_data->version.minor = SYSTEMCFG_MINOR;
+	vdso_data->processor = mfspr(SPRN_PVR);
+	/*
+	 * Fake the old platform number for pSeries and add
+	 * in LPAR bit if necessary
+	 */
+	vdso_data->platform = 0x100;
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		vdso_data->platform |= 1;
+	vdso_data->physicalMemorySize = memblock_phys_mem_size();
+	vdso_data->dcache_size = ppc64_caches.l1d.size;
+	vdso_data->dcache_line_size = ppc64_caches.l1d.line_size;
+	vdso_data->icache_size = ppc64_caches.l1i.size;
+	vdso_data->icache_line_size = ppc64_caches.l1i.line_size;
+	vdso_data->dcache_block_size = ppc64_caches.l1d.block_size;
+	vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
+	vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
+	vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
+
+	/*
+	 * Calculate the size of the 64 bits vDSO
+	 */
+	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
+	DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
+#else
+	vdso_data->dcache_block_size = L1_CACHE_BYTES;
+	vdso_data->dcache_log_block_size = L1_CACHE_SHIFT;
+	vdso_data->icache_block_size = L1_CACHE_BYTES;
+	vdso_data->icache_log_block_size = L1_CACHE_SHIFT;
+#endif /* CONFIG_PPC64 */
+
+
+#ifdef CONFIG_VDSO32
+	vdso32_kbase = &vdso32_start;
+
+	/*
+	 * Calculate the size of the 32 bits vDSO
+	 */
+	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
+	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
+#endif
+
+
+	/*
+	 * Setup the syscall map in the vDOS
+	 */
+	vdso_setup_syscall_map();
+
+	/*
+	 * Initialize the vDSO images in memory, that is do necessary
+	 * fixups of vDSO symbols, locate trampolines, etc...
+	 */
+	if (vdso_setup()) {
+		printk(KERN_ERR "vDSO setup failure, not enabled !\n");
+		vdso32_pages = 0;
+#ifdef CONFIG_PPC64
+		vdso64_pages = 0;
+#endif
+		return 0;
+	}
+
+#ifdef CONFIG_VDSO32
+	/* Make sure pages are in the correct state */
+	vdso32_pagelist = kcalloc(vdso32_pages + 2, sizeof(struct page *),
+				  GFP_KERNEL);
+	BUG_ON(vdso32_pagelist == NULL);
+	for (i = 0; i < vdso32_pages; i++) {
+		struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		get_page(pg);
+		vdso32_pagelist[i] = pg;
+	}
+	vdso32_pagelist[i++] = virt_to_page(vdso_data);
+	vdso32_pagelist[i] = NULL;
+#endif
+
+#ifdef CONFIG_PPC64
+	vdso64_pagelist = kcalloc(vdso64_pages + 2, sizeof(struct page *),
+				  GFP_KERNEL);
+	BUG_ON(vdso64_pagelist == NULL);
+	for (i = 0; i < vdso64_pages; i++) {
+		struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		get_page(pg);
+		vdso64_pagelist[i] = pg;
+	}
+	vdso64_pagelist[i++] = virt_to_page(vdso_data);
+	vdso64_pagelist[i] = NULL;
+#endif /* CONFIG_PPC64 */
+
+	get_page(virt_to_page(vdso_data));
+
+	smp_wmb();
+	vdso_ready = 1;
+
+	return 0;
+}
+arch_initcall(vdso_init);
diff --git a/arch/powerpc/kernel/vdso32/.gitignore b/arch/powerpc/kernel/vdso32/.gitignore
new file mode 100644
index 000000000..fea580985
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+vdso32.lds
+vdso32.so.dbg
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
new file mode 100644
index 000000000..50112d447
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# List of files in the vdso, has to be asm only for now
+
+obj-vdso32-$(CONFIG_PPC64) = getcpu.o
+obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \
+		$(obj-vdso32-y)
+
+# Build rules
+
+ifdef CROSS32_COMPILE
+    VDSOCC := $(CROSS32_COMPILE)gcc
+else
+    VDSOCC := $(CC)
+endif
+
+CC32FLAGS :=
+ifdef CONFIG_PPC64
+CC32FLAGS += -m32
+endif
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+
+ccflags-y := -shared -fno-common -fno-builtin
+ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
+		$(call cc-ldoption, -Wl$(comma)--hash-style=both)
+asflags-y := -D__VDSO32__ -s
+
+obj-y += vdso32_wrapper.o
+extra-y += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -Upowerpc
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+	$(call if_changed,vdso32ld)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+# assembly rules for the .S files
+$(obj-vdso32): %.o: %.S FORCE
+	$(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32ld = VDSO32L $@
+      cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
+quiet_cmd_vdso32as = VDSO32A $@
+      cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso_install: vdso32.so
diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S
new file mode 100644
index 000000000..1ba6feb71
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/cacheflush.S
@@ -0,0 +1,85 @@
+/*
+ * vDSO provided cache flush routines
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/asm-offsets.h>
+
+	.text
+
+/*
+ * Default "generic" version of __kernel_sync_dicache.
+ *
+ * void __kernel_sync_dicache(unsigned long start, unsigned long end)
+ *
+ * Flushes the data cache & invalidate the instruction cache for the
+ * provided range [start, end[
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	mr	r11,r3
+	bl	__get_datapage@local
+	mtlr	r12
+	mr	r10,r3
+
+	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
+	addi	r5,r7,-1
+	andc	r6,r11,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
+	srw.	r8,r8,r9		/* compute line count */
+	crclr	cr0*4+so
+	beqlr				/* nothing to do? */
+	mtctr	r8
+1:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	1b
+	sync
+
+/* Now invalidate the instruction cache */
+
+	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
+	addi	r5,r7,-1
+	andc	r6,r11,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5
+	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
+	srw.	r8,r8,r9		/* compute line count */
+	crclr	cr0*4+so
+	beqlr				/* nothing to do? */
+	mtctr	r8
+2:	icbi	0,r6
+	add	r6,r6,r7
+	bdnz	2b
+	isync
+	li	r3,0
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache)
+
+
+/*
+ * POWER5 version of __kernel_sync_dicache
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache_p5)
+  .cfi_startproc
+	crclr	cr0*4+so
+	sync
+	isync
+	li	r3,0
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache_p5)
+
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
new file mode 100644
index 000000000..2a7eb5452
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -0,0 +1,88 @@
+/*
+ * Access to the shared data page by the vDSO & syscall map
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+	.global	__kernel_datapage_offset;
+__kernel_datapage_offset:
+	.long	0
+
+V_FUNCTION_BEGIN(__get_datapage)
+  .cfi_startproc
+	/* We don't want that exposed or overridable as we want other objects
+	 * to be able to bl directly to here
+	 */
+	.protected __get_datapage
+	.hidden __get_datapage
+
+	mflr	r0
+  .cfi_register lr,r0
+
+	bcl	20,31,data_page_branch
+data_page_branch:
+	mflr	r3
+	mtlr	r0
+	addi	r3, r3, __kernel_datapage_offset-data_page_branch
+	lwz	r0,0(r3)
+  .cfi_restore lr
+	add	r3,r0,r3
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__get_datapage)
+
+/*
+ * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
+ *
+ * returns a pointer to the syscall map. the map is agnostic to the
+ * size of "long", unlike kernel bitops, it stores bits from top to
+ * bottom so that memory actually contains a linear bitmap
+ * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
+ * 32 bits int at N >> 5.
+ */
+V_FUNCTION_BEGIN(__kernel_get_syscall_map)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	mr	r4,r3
+	bl	__get_datapage@local
+	mtlr	r12
+	addi	r3,r3,CFG_SYSCALL_MAP32
+	cmpli	cr0,r4,0
+	beqlr
+	li	r0,NR_syscalls
+	stw	r0,0(r4)
+	crclr	cr0*4+so
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_syscall_map)
+
+/*
+ * void unsigned long long  __kernel_get_tbfreq(void);
+ *
+ * returns the timebase frequency in HZ
+ */
+V_FUNCTION_BEGIN(__kernel_get_tbfreq)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	bl	__get_datapage@local
+	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
+	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
+	mtlr	r12
+	crclr	cr0*4+so
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_tbfreq)
diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S
new file mode 100644
index 000000000..c62be60c7
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/getcpu.S
@@ -0,0 +1,45 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+	.text
+/*
+ * Exact prototype of getcpu
+ *
+ * int __kernel_getcpu(unsigned *cpu, unsigned *node);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_getcpu)
+  .cfi_startproc
+	mfspr	r5,SPRN_SPRG_VDSO_READ
+	cmpwi	cr0,r3,0
+	cmpwi	cr1,r4,0
+	clrlwi  r6,r5,16
+	rlwinm  r7,r5,16,31-15,31-0
+	beq	cr0,1f
+	stw	r6,0(r3)
+1:	beq	cr1,2f
+	stw	r7,0(r4)
+2:	crclr	cr0*4+so
+	li	r3,0			/* always success */
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_getcpu)
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
new file mode 100644
index 000000000..49eecd28a
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -0,0 +1,296 @@
+/*
+ * Userland implementation of gettimeofday() for 32 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org,
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+
+/* Offset for the low 32-bit part of a field of long type */
+#ifdef CONFIG_PPC64
+#define LOPART	4
+#define TSPEC_TV_SEC	TSPC64_TV_SEC+LOPART
+#else
+#define LOPART	0
+#define TSPEC_TV_SEC	TSPC32_TV_SEC
+#endif
+
+	.text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_gettimeofday)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r10,r3			/* r10 saves tv */
+	mr	r11,r4			/* r11 saves tz */
+	bl	__get_datapage@local	/* get data page */
+	mr	r9, r3			/* datapage ptr in r9 */
+	cmplwi	r10,0			/* check if tv is NULL */
+	beq	3f
+	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
+	addi	r7,r7,1000000@l		/* so we get microseconds in r4 */
+	bl	__do_get_tspec@local	/* get sec/usec from tb & kernel */
+	stw	r3,TVAL32_TV_SEC(r10)
+	stw	r4,TVAL32_TV_USEC(r10)
+
+3:	cmplwi	r11,0			/* check if tz is NULL */
+	beq	1f
+	lwz	r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */
+	lwz	r5,CFG_TZ_DSTTIME(r9)
+	stw	r4,TZONE_TZ_MINWEST(r11)
+	stw	r5,TZONE_TZ_DSTTIME(r11)
+
+1:	mtlr	r12
+	crclr	cr0*4+so
+	li	r3,0
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_gettimeofday)
+
+/*
+ * Exact prototype of clock_gettime()
+ *
+ * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_clock_gettime)
+  .cfi_startproc
+	/* Check for supported clock IDs */
+	cmpli	cr0,r3,CLOCK_REALTIME
+	cmpli	cr1,r3,CLOCK_MONOTONIC
+	cror	cr0*4+eq,cr0*4+eq,cr1*4+eq
+	bne	cr0,99f
+
+	mflr	r12			/* r12 saves lr */
+  .cfi_register lr,r12
+	mr	r11,r4			/* r11 saves tp */
+	bl	__get_datapage@local	/* get data page */
+	mr	r9,r3			/* datapage ptr in r9 */
+	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
+	ori	r7,r7,NSEC_PER_SEC@l
+50:	bl	__do_get_tspec@local	/* get sec/nsec from tb & kernel */
+	bne	cr1,80f			/* not monotonic -> all done */
+
+	/*
+	 * CLOCK_MONOTONIC
+	 */
+
+	/* now we must fixup using wall to monotonic. We need to snapshot
+	 * that value and do the counter trick again. Fortunately, we still
+	 * have the counter value in r8 that was returned by __do_get_xsec.
+	 * At this point, r3,r4 contain our sec/nsec values, r5 and r6
+	 * can be used, r7 contains NSEC_PER_SEC.
+	 */
+
+	lwz	r5,(WTOM_CLOCK_SEC+LOPART)(r9)
+	lwz	r6,WTOM_CLOCK_NSEC(r9)
+
+	/* We now have our offset in r5,r6. We create a fake dependency
+	 * on that value and re-check the counter
+	 */
+	or	r0,r6,r5
+	xor	r0,r0,r0
+	add	r9,r9,r0
+	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
+        cmpl    cr0,r8,r0		/* check if updated */
+	bne-	50b
+
+	/* Calculate and store result. Note that this mimics the C code,
+	 * which may cause funny results if nsec goes negative... is that
+	 * possible at all ?
+	 */
+	add	r3,r3,r5
+	add	r4,r4,r6
+	cmpw	cr0,r4,r7
+	cmpwi	cr1,r4,0
+	blt	1f
+	subf	r4,r7,r4
+	addi	r3,r3,1
+1:	bge	cr1,80f
+	addi	r3,r3,-1
+	add	r4,r4,r7
+
+80:	stw	r3,TSPC32_TV_SEC(r11)
+	stw	r4,TSPC32_TV_NSEC(r11)
+
+	mtlr	r12
+	crclr	cr0*4+so
+	li	r3,0
+	blr
+
+	/*
+	 * syscall fallback
+	 */
+99:
+	li	r0,__NR_clock_gettime
+  .cfi_restore lr
+	sc
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_clock_gettime)
+
+
+/*
+ * Exact prototype of clock_getres()
+ *
+ * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_clock_getres)
+  .cfi_startproc
+	/* Check for supported clock IDs */
+	cmpwi	cr0,r3,CLOCK_REALTIME
+	cmpwi	cr1,r3,CLOCK_MONOTONIC
+	cror	cr0*4+eq,cr0*4+eq,cr1*4+eq
+	bne	cr0,99f
+
+	mflr	r12
+  .cfi_register lr,r12
+	bl	__get_datapage@local	/* get data page */
+	lwz	r5, CLOCK_HRTIMER_RES(r3)
+	mtlr	r12
+	li	r3,0
+	cmpli	cr0,r4,0
+	crclr	cr0*4+so
+	beqlr
+	stw	r3,TSPC32_TV_SEC(r4)
+	stw	r5,TSPC32_TV_NSEC(r4)
+	blr
+
+	/*
+	 * syscall fallback
+	 */
+99:
+	li	r0,__NR_clock_getres
+	sc
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_clock_getres)
+
+
+/*
+ * Exact prototype of time()
+ *
+ * time_t time(time *t);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_time)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r11,r3			/* r11 holds t */
+	bl	__get_datapage@local
+	mr	r9, r3			/* datapage ptr in r9 */
+
+	lwz	r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
+
+	cmplwi	r11,0			/* check if t is NULL */
+	beq	2f
+	stw	r3,0(r11)		/* store result at *t */
+2:	mtlr	r12
+	crclr	cr0*4+so
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_time)
+
+/*
+ * This is the core of clock_gettime() and gettimeofday(),
+ * it returns the current time in r3 (seconds) and r4.
+ * On entry, r7 gives the resolution of r4, either USEC_PER_SEC
+ * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds.
+ * It expects the datapage ptr in r9 and doesn't clobber it.
+ * It clobbers r0, r5 and r6.
+ * On return, r8 contains the counter value that can be reused.
+ * This clobbers cr0 but not any other cr field.
+ */
+__do_get_tspec:
+  .cfi_startproc
+	/* Check for update count & load values. We use the low
+	 * order 32 bits of the update count
+	 */
+1:	lwz	r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
+	andi.	r0,r8,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r8,r8		/* create dependency */
+	add	r9,r9,r0
+
+	/* Load orig stamp (offset to TB) */
+	lwz	r5,CFG_TB_ORIG_STAMP(r9)
+	lwz	r6,(CFG_TB_ORIG_STAMP+4)(r9)
+
+	/* Get a stable TB value */
+2:	MFTBU(r3)
+	MFTBL(r4)
+	MFTBU(r0)
+	cmplw	cr0,r3,r0
+	bne-	2b
+
+	/* Subtract tb orig stamp and shift left 12 bits.
+	 */
+	subfc	r4,r6,r4
+	subfe	r0,r5,r3
+	slwi	r0,r0,12
+	rlwimi.	r0,r4,12,20,31
+	slwi	r4,r4,12
+
+	/*
+	 * Load scale factor & do multiplication.
+	 * We only use the high 32 bits of the tb_to_xs value.
+	 * Even with a 1GHz timebase clock, the high 32 bits of
+	 * tb_to_xs will be at least 4 million, so the error from
+	 * ignoring the low 32 bits will be no more than 0.25ppm.
+	 * The error will just make the clock run very very slightly
+	 * slow until the next time the kernel updates the VDSO data,
+	 * at which point the clock will catch up to the kernel's value,
+	 * so there is no long-term error accumulation.
+	 */
+	lwz	r5,CFG_TB_TO_XS(r9)	/* load values */
+	mulhwu	r4,r4,r5
+	li	r3,0
+
+	beq+	4f			/* skip high part computation if 0 */
+	mulhwu	r3,r0,r5
+	mullw	r5,r0,r5
+	addc	r4,r4,r5
+	addze	r3,r3
+4:
+	/* At this point, we have seconds since the xtime stamp
+	 * as a 32.32 fixed-point number in r3 and r4.
+	 * Load & add the xtime stamp.
+	 */
+	lwz	r5,STAMP_XTIME+TSPEC_TV_SEC(r9)
+	lwz	r6,STAMP_SEC_FRAC(r9)
+	addc	r4,r4,r6
+	adde	r3,r3,r5
+
+	/* We create a fake dependency on the result in r3/r4
+	 * and re-check the counter
+	 */
+	or	r6,r4,r3
+	xor	r0,r6,r6
+	add	r9,r9,r0
+	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
+        cmplw	cr0,r8,r0		/* check if updated */
+	bne-	1b
+
+	mulhwu	r4,r4,r7		/* convert to micro or nanoseconds */
+
+	blr
+  .cfi_endproc
diff --git a/arch/powerpc/kernel/vdso32/note.S b/arch/powerpc/kernel/vdso32/note.S
new file mode 100644
index 000000000..227a73273
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/note.S
@@ -0,0 +1,28 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/build-salt.h>
+
+#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)			      \
+	.section name, flags;						      \
+	.balign 4;							      \
+	.long 1f - 0f;		/* name length */			      \
+	.long 3f - 2f;		/* data length */			      \
+	.long type;		/* note type */				      \
+0:	.asciz vendor;		/* vendor name */			      \
+1:	.balign 4;							      \
+2:
+
+#define ASM_ELF_NOTE_END						      \
+3:	.balign 4;		/* pad out section */			      \
+	.previous
+
+	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
+	.long LINUX_VERSION_CODE
+	ASM_ELF_NOTE_END
+
+BUILD_SALT
diff --git a/arch/powerpc/kernel/vdso32/sigtramp.S b/arch/powerpc/kernel/vdso32/sigtramp.S
new file mode 100644
index 000000000..cf0c9c9c2
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/sigtramp.S
@@ -0,0 +1,299 @@
+/*
+ * Signal trampolines for 32 bits processes in a ppc64 kernel for
+ * use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+
+/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
+   the return address to get an address in the middle of the presumed
+   call instruction.  Since we don't have a call here, we artificially
+   extend the range covered by the unwind info by adding a nop before
+   the real start.  */
+	nop
+V_FUNCTION_BEGIN(__kernel_sigtramp32)
+.Lsig_start = . - 4
+	li	r0,__NR_sigreturn
+	sc
+.Lsig_end:
+V_FUNCTION_END(__kernel_sigtramp32)
+
+.Lsigrt_start:
+	nop
+V_FUNCTION_BEGIN(__kernel_sigtramp_rt32)
+	li	r0,__NR_rt_sigreturn
+	sc
+.Lsigrt_end:
+V_FUNCTION_END(__kernel_sigtramp_rt32)
+
+	.section .eh_frame,"a",@progbits
+
+/* Register r1 can be found at offset 4 of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define cfa_save \
+  .byte 0x0f;			/* DW_CFA_def_cfa_expression */		\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 RSIZE;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+9:
+
+/* Register REGNO can be found at offset OFS of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define rsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .ifne ofs;								\
+    .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+  .endif;								\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  The VMX reg struct is at offset VREGS of
+   the pt_regs struct.  This macro is for REGNO == 0, and contains
+   'subroutines' that the other macros jump to.  */
+#define vsave_msr0(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit0 */			\
+2:									\
+  .byte 0x40;			/*     DW_OP_lit16 */			\
+  .byte 0x1e;			/*     DW_OP_mul */			\
+3:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x12;			/*     DW_OP_dup */			\
+  .byte 0x23;			/*     DW_OP_plus_uconst */		\
+    .uleb128 33*RSIZE;		/*       msr offset */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x0c; .long 1 << 25;	/*     DW_OP_const4u */			\
+  .byte 0x1a;			/*     DW_OP_and */			\
+  .byte 0x12;			/*     DW_OP_dup, ret 0 if bra taken */	\
+  .byte 0x30;			/*     DW_OP_lit0 */			\
+  .byte 0x29;			/*     DW_OP_eq */			\
+  .byte 0x28; .short 0x7fff;	/*     DW_OP_bra to end */		\
+  .byte 0x13;			/*     DW_OP_drop, pop the 0 */		\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x22;			/*     DW_OP_plus */			\
+  .byte 0x2f; .short 0x7fff;	/*     DW_OP_skip to end */		\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  REGNO is 1 thru 31.  */
+#define vsave_msr1(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit n */			\
+  .byte 0x2f; .short 2b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of
+   the VMX save block.  */
+#define vsave_msr2(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x0a; .short ofs;	/*     DW_OP_const2u */			\
+  .byte 0x2f; .short 3b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* VMX register REGNO is at offset OFS of the VMX save area.  */
+#define vsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+9:
+
+/* This is where the pt_regs pointer can be found on the stack.  */
+#define PTREGS 64+28
+
+/* Size of regs.  */
+#define RSIZE 4
+
+/* This is the offset of the VMX regs.  */
+#define VREGS 48*RSIZE+34*8
+
+/* Describe where general purpose regs are saved.  */
+#define EH_FRAME_GEN \
+  cfa_save;								\
+  rsave ( 0,  0*RSIZE);							\
+  rsave ( 2,  2*RSIZE);							\
+  rsave ( 3,  3*RSIZE);							\
+  rsave ( 4,  4*RSIZE);							\
+  rsave ( 5,  5*RSIZE);							\
+  rsave ( 6,  6*RSIZE);							\
+  rsave ( 7,  7*RSIZE);							\
+  rsave ( 8,  8*RSIZE);							\
+  rsave ( 9,  9*RSIZE);							\
+  rsave (10, 10*RSIZE);							\
+  rsave (11, 11*RSIZE);							\
+  rsave (12, 12*RSIZE);							\
+  rsave (13, 13*RSIZE);							\
+  rsave (14, 14*RSIZE);							\
+  rsave (15, 15*RSIZE);							\
+  rsave (16, 16*RSIZE);							\
+  rsave (17, 17*RSIZE);							\
+  rsave (18, 18*RSIZE);							\
+  rsave (19, 19*RSIZE);							\
+  rsave (20, 20*RSIZE);							\
+  rsave (21, 21*RSIZE);							\
+  rsave (22, 22*RSIZE);							\
+  rsave (23, 23*RSIZE);							\
+  rsave (24, 24*RSIZE);							\
+  rsave (25, 25*RSIZE);							\
+  rsave (26, 26*RSIZE);							\
+  rsave (27, 27*RSIZE);							\
+  rsave (28, 28*RSIZE);							\
+  rsave (29, 29*RSIZE);							\
+  rsave (30, 30*RSIZE);							\
+  rsave (31, 31*RSIZE);							\
+  rsave (67, 32*RSIZE);		/* ap, used as temp for nip */		\
+  rsave (65, 36*RSIZE);		/* lr */				\
+  rsave (70, 38*RSIZE)		/* cr */
+
+/* Describe where the FP regs are saved.  */
+#define EH_FRAME_FP \
+  rsave (32, 48*RSIZE +  0*8);						\
+  rsave (33, 48*RSIZE +  1*8);						\
+  rsave (34, 48*RSIZE +  2*8);						\
+  rsave (35, 48*RSIZE +  3*8);						\
+  rsave (36, 48*RSIZE +  4*8);						\
+  rsave (37, 48*RSIZE +  5*8);						\
+  rsave (38, 48*RSIZE +  6*8);						\
+  rsave (39, 48*RSIZE +  7*8);						\
+  rsave (40, 48*RSIZE +  8*8);						\
+  rsave (41, 48*RSIZE +  9*8);						\
+  rsave (42, 48*RSIZE + 10*8);						\
+  rsave (43, 48*RSIZE + 11*8);						\
+  rsave (44, 48*RSIZE + 12*8);						\
+  rsave (45, 48*RSIZE + 13*8);						\
+  rsave (46, 48*RSIZE + 14*8);						\
+  rsave (47, 48*RSIZE + 15*8);						\
+  rsave (48, 48*RSIZE + 16*8);						\
+  rsave (49, 48*RSIZE + 17*8);						\
+  rsave (50, 48*RSIZE + 18*8);						\
+  rsave (51, 48*RSIZE + 19*8);						\
+  rsave (52, 48*RSIZE + 20*8);						\
+  rsave (53, 48*RSIZE + 21*8);						\
+  rsave (54, 48*RSIZE + 22*8);						\
+  rsave (55, 48*RSIZE + 23*8);						\
+  rsave (56, 48*RSIZE + 24*8);						\
+  rsave (57, 48*RSIZE + 25*8);						\
+  rsave (58, 48*RSIZE + 26*8);						\
+  rsave (59, 48*RSIZE + 27*8);						\
+  rsave (60, 48*RSIZE + 28*8);						\
+  rsave (61, 48*RSIZE + 29*8);						\
+  rsave (62, 48*RSIZE + 30*8);						\
+  rsave (63, 48*RSIZE + 31*8)
+
+/* Describe where the VMX regs are saved.  */
+#ifdef CONFIG_ALTIVEC
+#define EH_FRAME_VMX \
+  vsave_msr0 ( 0);							\
+  vsave_msr1 ( 1);							\
+  vsave_msr1 ( 2);							\
+  vsave_msr1 ( 3);							\
+  vsave_msr1 ( 4);							\
+  vsave_msr1 ( 5);							\
+  vsave_msr1 ( 6);							\
+  vsave_msr1 ( 7);							\
+  vsave_msr1 ( 8);							\
+  vsave_msr1 ( 9);							\
+  vsave_msr1 (10);							\
+  vsave_msr1 (11);							\
+  vsave_msr1 (12);							\
+  vsave_msr1 (13);							\
+  vsave_msr1 (14);							\
+  vsave_msr1 (15);							\
+  vsave_msr1 (16);							\
+  vsave_msr1 (17);							\
+  vsave_msr1 (18);							\
+  vsave_msr1 (19);							\
+  vsave_msr1 (20);							\
+  vsave_msr1 (21);							\
+  vsave_msr1 (22);							\
+  vsave_msr1 (23);							\
+  vsave_msr1 (24);							\
+  vsave_msr1 (25);							\
+  vsave_msr1 (26);							\
+  vsave_msr1 (27);							\
+  vsave_msr1 (28);							\
+  vsave_msr1 (29);							\
+  vsave_msr1 (30);							\
+  vsave_msr1 (31);							\
+  vsave_msr2 (33, 32*16+12);						\
+  vsave      (32, 32*16)
+#else
+#define EH_FRAME_VMX
+#endif
+
+.Lcie:
+	.long .Lcie_end - .Lcie_start
+.Lcie_start:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zRS"		/* NUL-terminated augmentation string */
+	.uleb128 4		/* Code alignment factor */
+	.sleb128 -4		/* Data alignment factor */
+	.byte 67		/* Return address register column, ap */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x1b		/* DW_EH_PE_pcrel | DW_EH_PE_sdata4. */
+	.byte 0x0c,1,0		/* DW_CFA_def_cfa: r1 ofs 0 */
+	.balign 4
+.Lcie_end:
+
+	.long .Lfde0_end - .Lfde0_start
+.Lfde0_start:
+	.long .Lfde0_start - .Lcie	/* CIE pointer. */
+	.long .Lsig_start - .		/* PC start, length */
+	.long .Lsig_end - .Lsig_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+	.balign 4
+.Lfde0_end:
+
+/* We have a different stack layout for rt_sigreturn.  */
+#undef PTREGS
+#define PTREGS 64+16+128+20+28
+
+	.long .Lfde1_end - .Lfde1_start
+.Lfde1_start:
+	.long .Lfde1_start - .Lcie	/* CIE pointer. */
+	.long .Lsigrt_start - .		/* PC start, length */
+	.long .Lsigrt_end - .Lsigrt_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+	.balign 4
+.Lfde1_end:
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000..099a6db14
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 32 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+#ifdef __LITTLE_ENDIAN__
+OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle")
+#else
+OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
+#endif
+OUTPUT_ARCH(powerpc:common)
+ENTRY(_start)
+
+SECTIONS
+{
+	. = VDSO32_LBASE + SIZEOF_HEADERS;
+
+	.hash          	: { *(.hash) }			:text
+	.gnu.hash      	: { *(.gnu.hash) }
+	.dynsym        	: { *(.dynsym) }
+	.dynstr        	: { *(.dynstr) }
+	.gnu.version   	: { *(.gnu.version) }
+	.gnu.version_d 	: { *(.gnu.version_d) }
+	.gnu.version_r 	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+
+	. = ALIGN(16);
+	.text		: {
+		*(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*)
+	}		      	      			:text
+	PROVIDE(__etext = .);
+	PROVIDE(_etext = .);
+	PROVIDE(etext = .);
+
+	. = ALIGN(8);
+	__ftr_fixup	: { *(__ftr_fixup) }
+
+	. = ALIGN(8);
+	__mmu_ftr_fixup	: { *(__mmu_ftr_fixup) }
+
+	. = ALIGN(8);
+	__lwsync_fixup	: { *(__lwsync_fixup) }
+
+#ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__fw_ftr_fixup	: { *(__fw_ftr_fixup) }
+#endif
+
+	/*
+	 * Other stuff is appended to the text segment:
+	 */
+	.rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+	.rodata1	: { *(.rodata1) }
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+	.gcc_except_table : { *(.gcc_except_table) }
+	.fixup		: { *(.fixup) }
+
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+	.got		: { *(.got) }			:text
+	.plt		: { *(.plt) }
+
+	_end = .;
+	__end = .;
+	PROVIDE(end = .);
+
+	/*
+	 * Stabs debugging sections are here too.
+	 */
+	.stab 0 : { *(.stab) }
+	.stabstr 0 : { *(.stabstr) }
+	.stab.excl 0 : { *(.stab.excl) }
+	.stab.exclstr 0 : { *(.stab.exclstr) }
+	.stab.index 0 : { *(.stab.index) }
+	.stab.indexstr 0 : { *(.stab.indexstr) }
+	.comment       0 : { *(.comment) }
+
+	/*
+	 * DWARF debug sections.
+	 * Symbols in the DWARF debugging sections are relative to the beginning
+	 * of the section so we begin them at 0.
+	 */
+	/* DWARF 1 */
+	.debug          0 : { *(.debug) }
+	.line           0 : { *(.line) }
+	/* GNU DWARF 1 extensions */
+	.debug_srcinfo  0 : { *(.debug_srcinfo) }
+	.debug_sfnames  0 : { *(.debug_sfnames) }
+	/* DWARF 1.1 and DWARF 2 */
+	.debug_aranges  0 : { *(.debug_aranges) }
+	.debug_pubnames 0 : { *(.debug_pubnames) }
+	/* DWARF 2 */
+	.debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
+	.debug_abbrev   0 : { *(.debug_abbrev) }
+	.debug_line     0 : { *(.debug_line) }
+	.debug_frame    0 : { *(.debug_frame) }
+	.debug_str      0 : { *(.debug_str) }
+	.debug_loc      0 : { *(.debug_loc) }
+	.debug_macinfo  0 : { *(.debug_macinfo) }
+	/* SGI/MIPS DWARF 2 extensions */
+	.debug_weaknames 0 : { *(.debug_weaknames) }
+	.debug_funcnames 0 : { *(.debug_funcnames) }
+	.debug_typenames 0 : { *(.debug_typenames) }
+	.debug_varnames  0 : { *(.debug_varnames) }
+
+	/DISCARD/	: {
+		*(.note.GNU-stack)
+		*(.data .data.* .gnu.linkonce.d.* .sdata*)
+		*(.bss .sbss .dynbss .dynsbss)
+	}
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME	0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD FILEHDR PHDRS FLAGS(5);	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC FLAGS(4);		/* PF_R */
+	note		PT_NOTE FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		/*
+		 * Has to be there for the kernel to find
+		 */
+		__kernel_datapage_offset;
+
+		__kernel_get_syscall_map;
+		__kernel_gettimeofday;
+		__kernel_clock_gettime;
+		__kernel_clock_getres;
+		__kernel_get_tbfreq;
+		__kernel_sync_dicache;
+		__kernel_sync_dicache_p5;
+		__kernel_sigtramp32;
+		__kernel_sigtramp_rt32;
+#ifdef CONFIG_PPC64
+		__kernel_getcpu;
+#endif
+		__kernel_time;
+
+	local: *;
+	};
+}
diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 000000000..3f5ef035b
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.globl vdso32_start, vdso32_end
+	.balign PAGE_SIZE
+vdso32_start:
+	.incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg"
+	.balign PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/powerpc/kernel/vdso64/.gitignore b/arch/powerpc/kernel/vdso64/.gitignore
new file mode 100644
index 000000000..77a0b4236
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/.gitignore
@@ -0,0 +1,2 @@
+vdso64.lds
+vdso64.so.dbg
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
new file mode 100644
index 000000000..69cecb346
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso, has to be asm only for now
+
+obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
+
+# Build rules
+
+targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
+obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+
+ccflags-y := -shared -fno-common -fno-builtin
+ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
+		$(call cc-ldoption, -Wl$(comma)--hash-style=both)
+asflags-y := -D__VDSO64__ -s
+
+obj-y += vdso64_wrapper.o
+extra-y += vdso64.lds
+CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
+
+# Force dependency (incbin is bad)
+$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
+
+# link rule for the .so file, .lds has to be first
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
+	$(call if_changed,vdso64ld)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+# actual build commands
+quiet_cmd_vdso64ld = VDSO64L $@
+      cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso64.so: $(obj)/vdso64.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso_install: vdso64.so
diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
new file mode 100644
index 000000000..228a4a238
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/cacheflush.S
@@ -0,0 +1,84 @@
+/*
+ * vDSO provided cache flush routines
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/asm-offsets.h>
+
+	.text
+
+/*
+ * Default "generic" version of __kernel_sync_dicache.
+ *
+ * void __kernel_sync_dicache(unsigned long start, unsigned long end)
+ *
+ * Flushes the data cache & invalidate the instruction cache for the
+ * provided range [start, end[
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	mr	r11,r3
+	bl	V_LOCAL_FUNC(__get_datapage)
+	mtlr	r12
+	mr	r10,r3
+
+	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
+	addi	r5,r7,-1
+	andc	r6,r11,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5		/* ensure we get enough */
+	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
+	srd.	r8,r8,r9		/* compute line count */
+	crclr	cr0*4+so
+	beqlr				/* nothing to do? */
+	mtctr	r8
+1:	dcbst	0,r6
+	add	r6,r6,r7
+	bdnz	1b
+	sync
+
+/* Now invalidate the instruction cache */
+
+	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
+	addi	r5,r7,-1
+	andc	r6,r11,r5		/* round low to line bdy */
+	subf	r8,r6,r4		/* compute length */
+	add	r8,r8,r5
+	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
+	srd.	r8,r8,r9		/* compute line count */
+	crclr	cr0*4+so
+	beqlr				/* nothing to do? */
+	mtctr	r8
+2:	icbi	0,r6
+	add	r6,r6,r7
+	bdnz	2b
+	isync
+	li	r3,0
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache)
+
+
+/*
+ * POWER5 version of __kernel_sync_dicache
+ */
+V_FUNCTION_BEGIN(__kernel_sync_dicache_p5)
+  .cfi_startproc
+	crclr	cr0*4+so
+	sync
+	isync
+	li	r3,0
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_sync_dicache_p5)
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
new file mode 100644
index 000000000..bf9668691
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -0,0 +1,88 @@
+/*
+ * Access to the shared data page by the vDSO & syscall map
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+
+	.text
+.global	__kernel_datapage_offset;
+__kernel_datapage_offset:
+	.long	0
+
+V_FUNCTION_BEGIN(__get_datapage)
+  .cfi_startproc
+	/* We don't want that exposed or overridable as we want other objects
+	 * to be able to bl directly to here
+	 */
+	.protected __get_datapage
+	.hidden __get_datapage
+
+	mflr	r0
+  .cfi_register lr,r0
+
+	bcl	20,31,data_page_branch
+data_page_branch:
+	mflr	r3
+	mtlr	r0
+	addi	r3, r3, __kernel_datapage_offset-data_page_branch
+	lwz	r0,0(r3)
+  .cfi_restore lr
+	add	r3,r0,r3
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__get_datapage)
+
+/*
+ * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
+ *
+ * returns a pointer to the syscall map. the map is agnostic to the
+ * size of "long", unlike kernel bitops, it stores bits from top to
+ * bottom so that memory actually contains a linear bitmap
+ * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of
+ * 32 bits int at N >> 5.
+ */
+V_FUNCTION_BEGIN(__kernel_get_syscall_map)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	mr	r4,r3
+	bl	V_LOCAL_FUNC(__get_datapage)
+	mtlr	r12
+	addi	r3,r3,CFG_SYSCALL_MAP64
+	cmpldi	cr0,r4,0
+	crclr	cr0*4+so
+	beqlr
+	li	r0,NR_syscalls
+	stw	r0,0(r4)
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_syscall_map)
+
+
+/*
+ * void unsigned long  __kernel_get_tbfreq(void);
+ *
+ * returns the timebase frequency in HZ
+ */
+V_FUNCTION_BEGIN(__kernel_get_tbfreq)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+	bl	V_LOCAL_FUNC(__get_datapage)
+	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
+	mtlr	r12
+	crclr	cr0*4+so
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_get_tbfreq)
diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S
new file mode 100644
index 000000000..23eb9a944
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/getcpu.S
@@ -0,0 +1,45 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+	.text
+/*
+ * Exact prototype of getcpu
+ *
+ * int __kernel_getcpu(unsigned *cpu, unsigned *node);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_getcpu)
+  .cfi_startproc
+	mfspr	r5,SPRN_SPRG_VDSO_READ
+	cmpdi	cr0,r3,0
+	cmpdi	cr1,r4,0
+	clrlwi  r6,r5,16
+	rlwinm  r7,r5,16,31-15,31-0
+	beq	cr0,1f
+	stw	r6,0(r3)
+1:	beq	cr1,2f
+	stw	r7,0(r4)
+2:	crclr	cr0*4+so
+	li	r3,0			/* always success */
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_getcpu)
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
new file mode 100644
index 000000000..020e90d10
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -0,0 +1,293 @@
+/*
+ * Userland implementation of gettimeofday() for 64 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org),
+ *                    IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+#include <asm/asm-offsets.h>
+#include <asm/unistd.h>
+
+	.text
+/*
+ * Exact prototype of gettimeofday
+ *
+ * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_gettimeofday)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r11,r3			/* r11 holds tv */
+	mr	r10,r4			/* r10 holds tz */
+	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	cmpldi	r11,0			/* check if tv is NULL */
+	beq	2f
+	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
+	addi	r7,r7,1000000@l
+	bl	V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */
+	std	r4,TVAL64_TV_SEC(r11)	/* store sec in tv */
+	std	r5,TVAL64_TV_USEC(r11)	/* store usec in tv */
+2:	cmpldi	r10,0			/* check if tz is NULL */
+	beq	1f
+	lwz	r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */
+	lwz	r5,CFG_TZ_DSTTIME(r3)
+	stw	r4,TZONE_TZ_MINWEST(r10)
+	stw	r5,TZONE_TZ_DSTTIME(r10)
+1:	mtlr	r12
+	crclr	cr0*4+so
+	li	r3,0			/* always success */
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_gettimeofday)
+
+
+/*
+ * Exact prototype of clock_gettime()
+ *
+ * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_clock_gettime)
+  .cfi_startproc
+	/* Check for supported clock IDs */
+	cmpwi	cr0,r3,CLOCK_REALTIME
+	cmpwi	cr1,r3,CLOCK_MONOTONIC
+	cror	cr0*4+eq,cr0*4+eq,cr1*4+eq
+
+	cmpwi	cr5,r3,CLOCK_REALTIME_COARSE
+	cmpwi	cr6,r3,CLOCK_MONOTONIC_COARSE
+	cror	cr5*4+eq,cr5*4+eq,cr6*4+eq
+
+	cror	cr0*4+eq,cr0*4+eq,cr5*4+eq
+	bne	cr0,99f
+
+	mflr	r12			/* r12 saves lr */
+  .cfi_register lr,r12
+	mr	r11,r4			/* r11 saves tp */
+	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
+	ori	r7,r7,NSEC_PER_SEC@l
+	beq	cr5,70f
+50:	bl	V_LOCAL_FUNC(__do_get_tspec)	/* get time from tb & kernel */
+	bne	cr1,80f			/* if not monotonic, all done */
+
+	/*
+	 * CLOCK_MONOTONIC
+	 */
+
+	/* now we must fixup using wall to monotonic. We need to snapshot
+	 * that value and do the counter trick again. Fortunately, we still
+	 * have the counter value in r8 that was returned by __do_get_tspec.
+	 * At this point, r4,r5 contain our sec/nsec values.
+	 */
+
+	ld	r6,WTOM_CLOCK_SEC(r3)
+	lwa	r9,WTOM_CLOCK_NSEC(r3)
+
+	/* We now have our result in r6,r9. We create a fake dependency
+	 * on that result and re-check the counter
+	 */
+	or	r0,r6,r9
+	xor	r0,r0,r0
+	add	r3,r3,r0
+	ld	r0,CFG_TB_UPDATE_COUNT(r3)
+        cmpld   cr0,r0,r8		/* check if updated */
+	bne-	50b
+	b	78f
+
+	/*
+	 * For coarse clocks we get data directly from the vdso data page, so
+	 * we don't need to call __do_get_tspec, but we still need to do the
+	 * counter trick.
+	 */
+70:	ld      r8,CFG_TB_UPDATE_COUNT(r3)
+	andi.   r0,r8,1                 /* pending update ? loop */
+	bne-    70b
+	add     r3,r3,r0		/* r0 is already 0 */
+
+	/*
+	 * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
+	 * too
+	 */
+	ld      r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+	ld      r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
+	bne     cr6,75f
+
+	/* CLOCK_MONOTONIC_COARSE */
+	ld	r6,WTOM_CLOCK_SEC(r3)
+	lwa     r9,WTOM_CLOCK_NSEC(r3)
+
+	/* check if counter has updated */
+	or      r0,r6,r9
+75:	or	r0,r0,r4
+	or	r0,r0,r5
+	xor     r0,r0,r0
+	add     r3,r3,r0
+	ld      r0,CFG_TB_UPDATE_COUNT(r3)
+	cmpld   cr0,r0,r8               /* check if updated */
+	bne-    70b
+
+	/* Counter has not updated, so continue calculating proper values for
+	 * sec and nsec if monotonic coarse, or just return with the proper
+	 * values for realtime.
+	 */
+	bne     cr6,80f
+
+	/* Add wall->monotonic offset and check for overflow or underflow */
+78:	add     r4,r4,r6
+	add     r5,r5,r9
+	cmpd    cr0,r5,r7
+	cmpdi   cr1,r5,0
+	blt     79f
+	subf    r5,r7,r5
+	addi    r4,r4,1
+79:	bge     cr1,80f
+	addi    r4,r4,-1
+	add     r5,r5,r7
+
+80:	std	r4,TSPC64_TV_SEC(r11)
+	std	r5,TSPC64_TV_NSEC(r11)
+
+	mtlr	r12
+	crclr	cr0*4+so
+	li	r3,0
+	blr
+
+	/*
+	 * syscall fallback
+	 */
+99:
+	li	r0,__NR_clock_gettime
+  .cfi_restore lr
+	sc
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_clock_gettime)
+
+
+/*
+ * Exact prototype of clock_getres()
+ *
+ * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_clock_getres)
+  .cfi_startproc
+	/* Check for supported clock IDs */
+	cmpwi	cr0,r3,CLOCK_REALTIME
+	cmpwi	cr1,r3,CLOCK_MONOTONIC
+	cror	cr0*4+eq,cr0*4+eq,cr1*4+eq
+	bne	cr0,99f
+
+	mflr	r12
+  .cfi_register lr,r12
+	bl	V_LOCAL_FUNC(__get_datapage)
+	lwz	r5, CLOCK_HRTIMER_RES(r3)
+	mtlr	r12
+	li	r3,0
+	cmpldi	cr0,r4,0
+	crclr	cr0*4+so
+	beqlr
+	std	r3,TSPC64_TV_SEC(r4)
+	std	r5,TSPC64_TV_NSEC(r4)
+	blr
+
+	/*
+	 * syscall fallback
+	 */
+99:
+	li	r0,__NR_clock_getres
+	sc
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_clock_getres)
+
+/*
+ * Exact prototype of time()
+ *
+ * time_t time(time *t);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_time)
+  .cfi_startproc
+	mflr	r12
+  .cfi_register lr,r12
+
+	mr	r11,r3			/* r11 holds t */
+	bl	V_LOCAL_FUNC(__get_datapage)
+
+	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+
+	cmpldi	r11,0			/* check if t is NULL */
+	beq	2f
+	std	r4,0(r11)		/* store result at *t */
+2:	mtlr	r12
+	crclr	cr0*4+so
+	mr	r3,r4
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__kernel_time)
+
+
+/*
+ * This is the core of clock_gettime() and gettimeofday(),
+ * it returns the current time in r4 (seconds) and r5.
+ * On entry, r7 gives the resolution of r5, either USEC_PER_SEC
+ * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds.
+ * It expects the datapage ptr in r3 and doesn't clobber it.
+ * It clobbers r0, r6 and r9.
+ * On return, r8 contains the counter value that can be reused.
+ * This clobbers cr0 but not any other cr field.
+ */
+V_FUNCTION_BEGIN(__do_get_tspec)
+  .cfi_startproc
+	/* check for update count & load values */
+1:	ld	r8,CFG_TB_UPDATE_COUNT(r3)
+	andi.	r0,r8,1			/* pending update ? loop */
+	bne-	1b
+	xor	r0,r8,r8		/* create dependency */
+	add	r3,r3,r0
+
+	/* Get TB & offset it. We use the MFTB macro which will generate
+	 * workaround code for Cell.
+	 */
+	MFTB(r6)
+	ld	r9,CFG_TB_ORIG_STAMP(r3)
+	subf	r6,r9,r6
+
+	/* Scale result */
+	ld	r5,CFG_TB_TO_XS(r3)
+	sldi	r6,r6,12		/* compute time since stamp_xtime */
+	mulhdu	r6,r6,r5		/* in units of 2^-32 seconds */
+
+	/* Add stamp since epoch */
+	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+	lwz	r5,STAMP_SEC_FRAC(r3)
+	or	r0,r4,r5
+	or	r0,r0,r6
+	xor	r0,r0,r0
+	add	r3,r3,r0
+	ld	r0,CFG_TB_UPDATE_COUNT(r3)
+	cmpld   r0,r8			/* check if updated */
+	bne-	1b			/* reload if so */
+
+	/* convert to seconds & nanoseconds and add to stamp */
+	add	r6,r6,r5		/* add on fractional seconds of xtime */
+	mulhwu	r5,r6,r7		/* compute micro or nanoseconds and */
+	srdi	r6,r6,32		/* seconds since stamp_xtime */
+	clrldi	r5,r5,32
+	add	r4,r4,r6
+	blr
+  .cfi_endproc
+V_FUNCTION_END(__do_get_tspec)
diff --git a/arch/powerpc/kernel/vdso64/note.S b/arch/powerpc/kernel/vdso64/note.S
new file mode 100644
index 000000000..dc2a509f7
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/note.S
@@ -0,0 +1 @@
+#include "../vdso32/note.S"
diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
new file mode 100644
index 000000000..542c6f422
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/sigtramp.S
@@ -0,0 +1,311 @@
+/*
+ * Signal trampoline for 64 bits processes in a ppc64 kernel for
+ * use in the vDSO
+ *
+ * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
+ * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/vdso.h>
+#include <asm/ptrace.h>		/* XXX for __SIGNAL_FRAMESIZE */
+
+	.text
+
+/* The nop here is a hack.  The dwarf2 unwind routines subtract 1 from
+   the return address to get an address in the middle of the presumed
+   call instruction.  Since we don't have a call here, we artificially
+   extend the range covered by the unwind info by padding before the
+   real start.  */
+	nop
+	.balign 8
+V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
+.Lsigrt_start = . - 4
+	addi	r1, r1, __SIGNAL_FRAMESIZE
+	li	r0,__NR_rt_sigreturn
+	sc
+.Lsigrt_end:
+V_FUNCTION_END(__kernel_sigtramp_rt64)
+/* The ".balign 8" above and the following zeros mimic the old stack
+   trampoline layout.  The last magic value is the ucontext pointer,
+   chosen in such a way that older libgcc unwind code returns a zero
+   for a sigcontext pointer.  */
+	.long 0,0,0
+	.quad 0,-21*8
+
+/* Register r1 can be found at offset 8 of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define cfa_save \
+  .byte 0x0f;			/* DW_CFA_def_cfa_expression */		\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 RSIZE;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+9:
+
+/* Register REGNO can be found at offset OFS of a pt_regs structure.
+   A pointer to the pt_regs is stored in memory at the old sp plus PTREGS.  */
+#define rsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .ifne ofs;								\
+    .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+  .endif;								\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  A pointer to the VMX reg struct is at VREGS in
+   the pt_regs struct.  This macro is for REGNO == 0, and contains
+   'subroutines' that the other macros jump to.  */
+#define vsave_msr0(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit0 */			\
+2:									\
+  .byte 0x40;			/*     DW_OP_lit16 */			\
+  .byte 0x1e;			/*     DW_OP_mul */			\
+3:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x12;			/*     DW_OP_dup */			\
+  .byte 0x23;			/*     DW_OP_plus_uconst */		\
+    .uleb128 33*RSIZE;		/*       msr offset */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x0c; .long 1 << 25;	/*     DW_OP_const4u */			\
+  .byte 0x1a;			/*     DW_OP_and */			\
+  .byte 0x12;			/*     DW_OP_dup, ret 0 if bra taken */	\
+  .byte 0x30;			/*     DW_OP_lit0 */			\
+  .byte 0x29;			/*     DW_OP_eq */			\
+  .byte 0x28; .short 0x7fff;	/*     DW_OP_bra to end */		\
+  .byte 0x13;			/*     DW_OP_drop, pop the 0 */		\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x22;			/*     DW_OP_plus */			\
+  .byte 0x2f; .short 0x7fff;	/*     DW_OP_skip to end */		\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16
+   of the VMX reg struct.  REGNO is 1 thru 31.  */
+#define vsave_msr1(regno) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x30 + regno;		/*     DW_OP_lit n */			\
+  .byte 0x2f; .short 2b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of
+   the VMX save block.  */
+#define vsave_msr2(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x0a; .short ofs;	/*     DW_OP_const2u */			\
+  .byte 0x2f; .short 3b - 9f;	/*     DW_OP_skip */			\
+9:
+
+/* VMX register REGNO is at offset OFS of the VMX save area.  */
+#define vsave(regno, ofs) \
+  .byte 0x10;			/* DW_CFA_expression */			\
+  .uleb128 regno + 77;		/*   regno */				\
+  .uleb128 9f - 1f;		/*   length */				\
+1:									\
+  .byte 0x71; .sleb128 PTREGS;	/*     DW_OP_breg1 */			\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 VREGS;	/*     DW_OP_plus_uconst */		\
+  .byte 0x06;			/*     DW_OP_deref */			\
+  .byte 0x23; .uleb128 ofs;	/*     DW_OP_plus_uconst */		\
+9:
+
+/* This is where the pt_regs pointer can be found on the stack.  */
+#define PTREGS	128+168+56
+
+/* Size of regs.  */
+#define RSIZE	8
+
+/* Size of CR reg in DWARF unwind info. */
+#define CRSIZE	4
+
+/* Offset of CR reg within a full word. */
+#ifdef __LITTLE_ENDIAN__
+#define CROFF 0
+#else
+#define CROFF (RSIZE - CRSIZE)
+#endif
+
+/* This is the offset of the VMX reg pointer.  */
+#define VREGS	48*RSIZE+33*8
+
+/* Describe where general purpose regs are saved.  */
+#define EH_FRAME_GEN \
+  cfa_save;								\
+  rsave ( 0,  0*RSIZE);							\
+  rsave ( 2,  2*RSIZE);							\
+  rsave ( 3,  3*RSIZE);							\
+  rsave ( 4,  4*RSIZE);							\
+  rsave ( 5,  5*RSIZE);							\
+  rsave ( 6,  6*RSIZE);							\
+  rsave ( 7,  7*RSIZE);							\
+  rsave ( 8,  8*RSIZE);							\
+  rsave ( 9,  9*RSIZE);							\
+  rsave (10, 10*RSIZE);							\
+  rsave (11, 11*RSIZE);							\
+  rsave (12, 12*RSIZE);							\
+  rsave (13, 13*RSIZE);							\
+  rsave (14, 14*RSIZE);							\
+  rsave (15, 15*RSIZE);							\
+  rsave (16, 16*RSIZE);							\
+  rsave (17, 17*RSIZE);							\
+  rsave (18, 18*RSIZE);							\
+  rsave (19, 19*RSIZE);							\
+  rsave (20, 20*RSIZE);							\
+  rsave (21, 21*RSIZE);							\
+  rsave (22, 22*RSIZE);							\
+  rsave (23, 23*RSIZE);							\
+  rsave (24, 24*RSIZE);							\
+  rsave (25, 25*RSIZE);							\
+  rsave (26, 26*RSIZE);							\
+  rsave (27, 27*RSIZE);							\
+  rsave (28, 28*RSIZE);							\
+  rsave (29, 29*RSIZE);							\
+  rsave (30, 30*RSIZE);							\
+  rsave (31, 31*RSIZE);							\
+  rsave (67, 32*RSIZE);		/* ap, used as temp for nip */		\
+  rsave (65, 36*RSIZE);		/* lr */				\
+  rsave (68, 38*RSIZE + CROFF);	/* cr fields */				\
+  rsave (69, 38*RSIZE + CROFF);						\
+  rsave (70, 38*RSIZE + CROFF);						\
+  rsave (71, 38*RSIZE + CROFF);						\
+  rsave (72, 38*RSIZE + CROFF);						\
+  rsave (73, 38*RSIZE + CROFF);						\
+  rsave (74, 38*RSIZE + CROFF);						\
+  rsave (75, 38*RSIZE + CROFF)
+
+/* Describe where the FP regs are saved.  */
+#define EH_FRAME_FP \
+  rsave (32, 48*RSIZE +  0*8);						\
+  rsave (33, 48*RSIZE +  1*8);						\
+  rsave (34, 48*RSIZE +  2*8);						\
+  rsave (35, 48*RSIZE +  3*8);						\
+  rsave (36, 48*RSIZE +  4*8);						\
+  rsave (37, 48*RSIZE +  5*8);						\
+  rsave (38, 48*RSIZE +  6*8);						\
+  rsave (39, 48*RSIZE +  7*8);						\
+  rsave (40, 48*RSIZE +  8*8);						\
+  rsave (41, 48*RSIZE +  9*8);						\
+  rsave (42, 48*RSIZE + 10*8);						\
+  rsave (43, 48*RSIZE + 11*8);						\
+  rsave (44, 48*RSIZE + 12*8);						\
+  rsave (45, 48*RSIZE + 13*8);						\
+  rsave (46, 48*RSIZE + 14*8);						\
+  rsave (47, 48*RSIZE + 15*8);						\
+  rsave (48, 48*RSIZE + 16*8);						\
+  rsave (49, 48*RSIZE + 17*8);						\
+  rsave (50, 48*RSIZE + 18*8);						\
+  rsave (51, 48*RSIZE + 19*8);						\
+  rsave (52, 48*RSIZE + 20*8);						\
+  rsave (53, 48*RSIZE + 21*8);						\
+  rsave (54, 48*RSIZE + 22*8);						\
+  rsave (55, 48*RSIZE + 23*8);						\
+  rsave (56, 48*RSIZE + 24*8);						\
+  rsave (57, 48*RSIZE + 25*8);						\
+  rsave (58, 48*RSIZE + 26*8);						\
+  rsave (59, 48*RSIZE + 27*8);						\
+  rsave (60, 48*RSIZE + 28*8);						\
+  rsave (61, 48*RSIZE + 29*8);						\
+  rsave (62, 48*RSIZE + 30*8);						\
+  rsave (63, 48*RSIZE + 31*8)
+
+/* Describe where the VMX regs are saved.  */
+#ifdef CONFIG_ALTIVEC
+#define EH_FRAME_VMX \
+  vsave_msr0 ( 0);							\
+  vsave_msr1 ( 1);							\
+  vsave_msr1 ( 2);							\
+  vsave_msr1 ( 3);							\
+  vsave_msr1 ( 4);							\
+  vsave_msr1 ( 5);							\
+  vsave_msr1 ( 6);							\
+  vsave_msr1 ( 7);							\
+  vsave_msr1 ( 8);							\
+  vsave_msr1 ( 9);							\
+  vsave_msr1 (10);							\
+  vsave_msr1 (11);							\
+  vsave_msr1 (12);							\
+  vsave_msr1 (13);							\
+  vsave_msr1 (14);							\
+  vsave_msr1 (15);							\
+  vsave_msr1 (16);							\
+  vsave_msr1 (17);							\
+  vsave_msr1 (18);							\
+  vsave_msr1 (19);							\
+  vsave_msr1 (20);							\
+  vsave_msr1 (21);							\
+  vsave_msr1 (22);							\
+  vsave_msr1 (23);							\
+  vsave_msr1 (24);							\
+  vsave_msr1 (25);							\
+  vsave_msr1 (26);							\
+  vsave_msr1 (27);							\
+  vsave_msr1 (28);							\
+  vsave_msr1 (29);							\
+  vsave_msr1 (30);							\
+  vsave_msr1 (31);							\
+  vsave_msr2 (33, 32*16+12);						\
+  vsave      (32, 33*16)
+#else
+#define EH_FRAME_VMX
+#endif
+
+	.section .eh_frame,"a",@progbits
+.Lcie:
+	.long .Lcie_end - .Lcie_start
+.Lcie_start:
+	.long 0			/* CIE ID */
+	.byte 1			/* Version number */
+	.string "zRS"		/* NUL-terminated augmentation string */
+	.uleb128 4		/* Code alignment factor */
+	.sleb128 -8		/* Data alignment factor */
+	.byte 67		/* Return address register column, ap */
+	.uleb128 1		/* Augmentation value length */
+	.byte 0x14		/* DW_EH_PE_pcrel | DW_EH_PE_udata8. */
+	.byte 0x0c,1,0		/* DW_CFA_def_cfa: r1 ofs 0 */
+	.balign 8
+.Lcie_end:
+
+	.long .Lfde0_end - .Lfde0_start
+.Lfde0_start:
+	.long .Lfde0_start - .Lcie	/* CIE pointer. */
+	.quad .Lsigrt_start - .		/* PC start, length */
+	.quad .Lsigrt_end - .Lsigrt_start
+	.uleb128 0			/* Augmentation */
+	EH_FRAME_GEN
+	EH_FRAME_FP
+	EH_FRAME_VMX
+# Do we really need to describe the frame at this point?  ie. will
+# we ever have some call chain that returns somewhere past the addi?
+# I don't think so, since gcc doesn't support async signals.
+#	.byte 0x41		/* DW_CFA_advance_loc 1*4 */
+#undef PTREGS
+#define PTREGS 168+56
+#	EH_FRAME_GEN
+#	EH_FRAME_FP
+#	EH_FRAME_VMX
+	.balign 8
+.Lfde0_end:
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
new file mode 100644
index 000000000..256fb9720
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 64 bits vdso
+ * library
+ */
+#include <asm/vdso.h>
+
+#ifdef __LITTLE_ENDIAN__
+OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle")
+#else
+OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc")
+#endif
+OUTPUT_ARCH(powerpc:common64)
+ENTRY(_start)
+
+SECTIONS
+{
+	. = VDSO64_LBASE + SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+
+	. = ALIGN(16);
+	.text		: {
+		*(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*)
+		*(.sfpr .glink)
+	}						:text
+	PROVIDE(__etext = .);
+	PROVIDE(_etext = .);
+	PROVIDE(etext = .);
+
+	. = ALIGN(8);
+	__ftr_fixup	: { *(__ftr_fixup) }
+
+	. = ALIGN(8);
+	__mmu_ftr_fixup	: { *(__mmu_ftr_fixup) }
+
+	. = ALIGN(8);
+	__lwsync_fixup	: { *(__lwsync_fixup) }
+
+	. = ALIGN(8);
+	__fw_ftr_fixup	: { *(__fw_ftr_fixup) }
+
+	/*
+	 * Other stuff is appended to the text segment:
+	 */
+	.rodata		: { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+	.rodata1	: { *(.rodata1) }
+
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+	.gcc_except_table : { *(.gcc_except_table) }
+	.rela.dyn ALIGN(8) : { *(.rela.dyn) }
+
+	.opd ALIGN(8)	: { KEEP (*(.opd)) }
+	.got ALIGN(8)	: { *(.got .toc) }
+
+	_end = .;
+	PROVIDE(end = .);
+
+	/*
+	 * Stabs debugging sections are here too.
+	 */
+	.stab          0 : { *(.stab) }
+	.stabstr       0 : { *(.stabstr) }
+	.stab.excl     0 : { *(.stab.excl) }
+	.stab.exclstr  0 : { *(.stab.exclstr) }
+	.stab.index    0 : { *(.stab.index) }
+	.stab.indexstr 0 : { *(.stab.indexstr) }
+	.comment       0 : { *(.comment) }
+
+	/*
+	 * DWARF debug sections.
+	 * Symbols in the DWARF debugging sections are relative to the beginning
+	 * of the section so we begin them at 0.
+	 */
+	/* DWARF 1 */
+	.debug          0 : { *(.debug) }
+	.line           0 : { *(.line) }
+	/* GNU DWARF 1 extensions */
+	.debug_srcinfo  0 : { *(.debug_srcinfo) }
+	.debug_sfnames  0 : { *(.debug_sfnames) }
+	/* DWARF 1.1 and DWARF 2 */
+	.debug_aranges  0 : { *(.debug_aranges) }
+	.debug_pubnames 0 : { *(.debug_pubnames) }
+	/* DWARF 2 */
+	.debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
+	.debug_abbrev   0 : { *(.debug_abbrev) }
+	.debug_line     0 : { *(.debug_line) }
+	.debug_frame    0 : { *(.debug_frame) }
+	.debug_str      0 : { *(.debug_str) }
+	.debug_loc      0 : { *(.debug_loc) }
+	.debug_macinfo  0 : { *(.debug_macinfo) }
+	/* SGI/MIPS DWARF 2 extensions */
+	.debug_weaknames 0 : { *(.debug_weaknames) }
+	.debug_funcnames 0 : { *(.debug_funcnames) }
+	.debug_typenames 0 : { *(.debug_typenames) }
+	.debug_varnames  0 : { *(.debug_varnames) }
+
+	/DISCARD/	: {
+		*(.note.GNU-stack)
+		*(.branch_lt)
+		*(.data .data.* .gnu.linkonce.d.* .sdata*)
+		*(.bss .sbss .dynbss .dynsbss)
+	}
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME	0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD FILEHDR PHDRS FLAGS(5);	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC FLAGS(4);		/* PF_R */
+	note		PT_NOTE FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		/*
+		 * Has to be there for the kernel to find
+		 */
+		__kernel_datapage_offset;
+
+		__kernel_get_syscall_map;
+		__kernel_gettimeofday;
+		__kernel_clock_gettime;
+		__kernel_clock_getres;
+		__kernel_get_tbfreq;
+		__kernel_sync_dicache;
+		__kernel_sync_dicache_p5;
+		__kernel_sigtramp_rt64;
+		__kernel_getcpu;
+		__kernel_time;
+
+	local: *;
+	};
+}
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
new file mode 100644
index 000000000..1d56d81fe
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.globl vdso64_start, vdso64_end
+	.balign PAGE_SIZE
+vdso64_start:
+	.incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg"
+	.balign PAGE_SIZE
+vdso64_end:
+
+	.previous
diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c
new file mode 100644
index 000000000..4acd3fb2b
--- /dev/null
+++ b/arch/powerpc/kernel/vecemu.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Routines to emulate some Altivec/VMX instructions, specifically
+ * those that can trap when given denormalized operands in Java mode.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/processor.h>
+#include <asm/switch_to.h>
+#include <linux/uaccess.h>
+
+/* Functions in vector.S */
+extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b);
+extern void vsubfp(vector128 *dst, vector128 *a, vector128 *b);
+extern void vmaddfp(vector128 *dst, vector128 *a, vector128 *b, vector128 *c);
+extern void vnmsubfp(vector128 *dst, vector128 *a, vector128 *b, vector128 *c);
+extern void vrefp(vector128 *dst, vector128 *src);
+extern void vrsqrtefp(vector128 *dst, vector128 *src);
+extern void vexptep(vector128 *dst, vector128 *src);
+
+static unsigned int exp2s[8] = {
+	0x800000,
+	0x8b95c2,
+	0x9837f0,
+	0xa5fed7,
+	0xb504f3,
+	0xc5672a,
+	0xd744fd,
+	0xeac0c7
+};
+
+/*
+ * Computes an estimate of 2^x.  The `s' argument is the 32-bit
+ * single-precision floating-point representation of x.
+ */
+static unsigned int eexp2(unsigned int s)
+{
+	int exp, pwr;
+	unsigned int mant, frac;
+
+	/* extract exponent field from input */
+	exp = ((s >> 23) & 0xff) - 127;
+	if (exp > 7) {
+		/* check for NaN input */
+		if (exp == 128 && (s & 0x7fffff) != 0)
+			return s | 0x400000;	/* return QNaN */
+		/* 2^-big = 0, 2^+big = +Inf */
+		return (s & 0x80000000)? 0: 0x7f800000;	/* 0 or +Inf */
+	}
+	if (exp < -23)
+		return 0x3f800000;	/* 1.0 */
+
+	/* convert to fixed point integer in 9.23 representation */
+	pwr = (s & 0x7fffff) | 0x800000;
+	if (exp > 0)
+		pwr <<= exp;
+	else
+		pwr >>= -exp;
+	if (s & 0x80000000)
+		pwr = -pwr;
+
+	/* extract integer part, which becomes exponent part of result */
+	exp = (pwr >> 23) + 126;
+	if (exp >= 254)
+		return 0x7f800000;
+	if (exp < -23)
+		return 0;
+
+	/* table lookup on top 3 bits of fraction to get mantissa */
+	mant = exp2s[(pwr >> 20) & 7];
+
+	/* linear interpolation using remaining 20 bits of fraction */
+	asm("mulhwu %0,%1,%2" : "=r" (frac)
+	    : "r" (pwr << 12), "r" (0x172b83ff));
+	asm("mulhwu %0,%1,%2" : "=r" (frac) : "r" (frac), "r" (mant));
+	mant += frac;
+
+	if (exp >= 0)
+		return mant + (exp << 23);
+
+	/* denormalized result */
+	exp = -exp;
+	mant += 1 << (exp - 1);
+	return mant >> exp;
+}
+
+/*
+ * Computes an estimate of log_2(x).  The `s' argument is the 32-bit
+ * single-precision floating-point representation of x.
+ */
+static unsigned int elog2(unsigned int s)
+{
+	int exp, mant, lz, frac;
+
+	exp = s & 0x7f800000;
+	mant = s & 0x7fffff;
+	if (exp == 0x7f800000) {	/* Inf or NaN */
+		if (mant != 0)
+			s |= 0x400000;	/* turn NaN into QNaN */
+		return s;
+	}
+	if ((exp | mant) == 0)		/* +0 or -0 */
+		return 0xff800000;	/* return -Inf */
+
+	if (exp == 0) {
+		/* denormalized */
+		asm("cntlzw %0,%1" : "=r" (lz) : "r" (mant));
+		mant <<= lz - 8;
+		exp = (-118 - lz) << 23;
+	} else {
+		mant |= 0x800000;
+		exp -= 127 << 23;
+	}
+
+	if (mant >= 0xb504f3) {				/* 2^0.5 * 2^23 */
+		exp |= 0x400000;			/* 0.5 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xb504f334));	/* 2^-0.5 * 2^32 */
+	}
+	if (mant >= 0x9837f0) {				/* 2^0.25 * 2^23 */
+		exp |= 0x200000;			/* 0.25 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xd744fccb));	/* 2^-0.25 * 2^32 */
+	}
+	if (mant >= 0x8b95c2) {				/* 2^0.125 * 2^23 */
+		exp |= 0x100000;			/* 0.125 * 2^23 */
+		asm("mulhwu %0,%1,%2" : "=r" (mant)
+		    : "r" (mant), "r" (0xeac0c6e8));	/* 2^-0.125 * 2^32 */
+	}
+	if (mant > 0x800000) {				/* 1.0 * 2^23 */
+		/* calculate (mant - 1) * 1.381097463 */
+		/* 1.381097463 == 0.125 / (2^0.125 - 1) */
+		asm("mulhwu %0,%1,%2" : "=r" (frac)
+		    : "r" ((mant - 0x800000) << 1), "r" (0xb0c7cd3a));
+		exp += frac;
+	}
+	s = exp & 0x80000000;
+	if (exp != 0) {
+		if (s)
+			exp = -exp;
+		asm("cntlzw %0,%1" : "=r" (lz) : "r" (exp));
+		lz = 8 - lz;
+		if (lz > 0)
+			exp >>= lz;
+		else if (lz < 0)
+			exp <<= -lz;
+		s += ((lz + 126) << 23) + exp;
+	}
+	return s;
+}
+
+#define VSCR_SAT	1
+
+static int ctsxs(unsigned int x, int scale, unsigned int *vscrp)
+{
+	int exp, mant;
+
+	exp = (x >> 23) & 0xff;
+	mant = x & 0x7fffff;
+	if (exp == 255 && mant != 0)
+		return 0;		/* NaN -> 0 */
+	exp = exp - 127 + scale;
+	if (exp < 0)
+		return 0;		/* round towards zero */
+	if (exp >= 31) {
+		/* saturate, unless the result would be -2^31 */
+		if (x + (scale << 23) != 0xcf000000)
+			*vscrp |= VSCR_SAT;
+		return (x & 0x80000000)? 0x80000000: 0x7fffffff;
+	}
+	mant |= 0x800000;
+	mant = (mant << 7) >> (30 - exp);
+	return (x & 0x80000000)? -mant: mant;
+}
+
+static unsigned int ctuxs(unsigned int x, int scale, unsigned int *vscrp)
+{
+	int exp;
+	unsigned int mant;
+
+	exp = (x >> 23) & 0xff;
+	mant = x & 0x7fffff;
+	if (exp == 255 && mant != 0)
+		return 0;		/* NaN -> 0 */
+	exp = exp - 127 + scale;
+	if (exp < 0)
+		return 0;		/* round towards zero */
+	if (x & 0x80000000) {
+		/* negative => saturate to 0 */
+		*vscrp |= VSCR_SAT;
+		return 0;
+	}
+	if (exp >= 32) {
+		/* saturate */
+		*vscrp |= VSCR_SAT;
+		return 0xffffffff;
+	}
+	mant |= 0x800000;
+	mant = (mant << 8) >> (31 - exp);
+	return mant;
+}
+
+/* Round to floating integer, towards 0 */
+static unsigned int rfiz(unsigned int x)
+{
+	int exp;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if (exp < 0)
+		return x & 0x80000000;	/* |x| < 1.0 rounds to 0 */
+	return x & ~(0x7fffff >> exp);
+}
+
+/* Round to floating integer, towards +/- Inf */
+static unsigned int rfii(unsigned int x)
+{
+	int exp, mask;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if ((x & 0x7fffffff) == 0)
+		return x;		/* +/-0 -> +/-0 */
+	if (exp < 0)
+		/* 0 < |x| < 1.0 rounds to +/- 1.0 */
+		return (x & 0x80000000) | 0x3f800000;
+	mask = 0x7fffff >> exp;
+	/* mantissa overflows into exponent - that's OK,
+	   it can't overflow into the sign bit */
+	return (x + mask) & ~mask;
+}
+
+/* Round to floating integer, to nearest */
+static unsigned int rfin(unsigned int x)
+{
+	int exp, half;
+
+	exp = ((x >> 23) & 0xff) - 127;
+	if (exp == 128 && (x & 0x7fffff) != 0)
+		return x | 0x400000;	/* NaN -> make it a QNaN */
+	if (exp >= 23)
+		return x;		/* it's an integer already (or Inf) */
+	if (exp < -1)
+		return x & 0x80000000;	/* |x| < 0.5 -> +/-0 */
+	if (exp == -1)
+		/* 0.5 <= |x| < 1.0 rounds to +/- 1.0 */
+		return (x & 0x80000000) | 0x3f800000;
+	half = 0x400000 >> exp;
+	/* add 0.5 to the magnitude and chop off the fraction bits */
+	return (x + half) & ~(0x7fffff >> exp);
+}
+
+int emulate_altivec(struct pt_regs *regs)
+{
+	unsigned int instr, i;
+	unsigned int va, vb, vc, vd;
+	vector128 *vrs;
+
+	if (get_user(instr, (unsigned int __user *) regs->nip))
+		return -EFAULT;
+	if ((instr >> 26) != 4)
+		return -EINVAL;		/* not an altivec instruction */
+	vd = (instr >> 21) & 0x1f;
+	va = (instr >> 16) & 0x1f;
+	vb = (instr >> 11) & 0x1f;
+	vc = (instr >> 6) & 0x1f;
+
+	vrs = current->thread.vr_state.vr;
+	switch (instr & 0x3f) {
+	case 10:
+		switch (vc) {
+		case 0:	/* vaddfp */
+			vaddfp(&vrs[vd], &vrs[va], &vrs[vb]);
+			break;
+		case 1:	/* vsubfp */
+			vsubfp(&vrs[vd], &vrs[va], &vrs[vb]);
+			break;
+		case 4:	/* vrefp */
+			vrefp(&vrs[vd], &vrs[vb]);
+			break;
+		case 5:	/* vrsqrtefp */
+			vrsqrtefp(&vrs[vd], &vrs[vb]);
+			break;
+		case 6:	/* vexptefp */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = eexp2(vrs[vb].u[i]);
+			break;
+		case 7:	/* vlogefp */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = elog2(vrs[vb].u[i]);
+			break;
+		case 8:		/* vrfin */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = rfin(vrs[vb].u[i]);
+			break;
+		case 9:		/* vrfiz */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = rfiz(vrs[vb].u[i]);
+			break;
+		case 10:	/* vrfip */
+			for (i = 0; i < 4; ++i) {
+				u32 x = vrs[vb].u[i];
+				x = (x & 0x80000000)? rfiz(x): rfii(x);
+				vrs[vd].u[i] = x;
+			}
+			break;
+		case 11:	/* vrfim */
+			for (i = 0; i < 4; ++i) {
+				u32 x = vrs[vb].u[i];
+				x = (x & 0x80000000)? rfii(x): rfiz(x);
+				vrs[vd].u[i] = x;
+			}
+			break;
+		case 14:	/* vctuxs */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = ctuxs(vrs[vb].u[i], va,
+					&current->thread.vr_state.vscr.u[3]);
+			break;
+		case 15:	/* vctsxs */
+			for (i = 0; i < 4; ++i)
+				vrs[vd].u[i] = ctsxs(vrs[vb].u[i], va,
+					&current->thread.vr_state.vscr.u[3]);
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case 46:	/* vmaddfp */
+		vmaddfp(&vrs[vd], &vrs[va], &vrs[vb], &vrs[vc]);
+		break;
+	case 47:	/* vnmsubfp */
+		vnmsubfp(&vrs[vd], &vrs[va], &vrs[vb], &vrs[vc]);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
new file mode 100644
index 000000000..21165da00
--- /dev/null
+++ b/arch/powerpc/kernel/vector.S
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/export.h>
+#include <asm/asm-compat.h>
+
+/*
+ * Load state from memory into VMX registers including VSCR.
+ * Assumes the caller has enabled VMX in the MSR.
+ */
+_GLOBAL(load_vr_state)
+	li	r4,VRSTATE_VSCR
+	lvx	v0,r4,r3
+	mtvscr	v0
+	REST_32VRS(0,r4,r3)
+	blr
+EXPORT_SYMBOL(load_vr_state)
+
+/*
+ * Store VMX state into memory, including VSCR.
+ * Assumes the caller has enabled VMX in the MSR.
+ */
+_GLOBAL(store_vr_state)
+	SAVE_32VRS(0, r4, r3)
+	mfvscr	v0
+	li	r4, VRSTATE_VSCR
+	stvx	v0, r4, r3
+	blr
+EXPORT_SYMBOL(store_vr_state)
+
+/*
+ * Disable VMX for the task which had it previously,
+ * and save its vector registers in its thread_struct.
+ * Enables the VMX for use in the kernel on return.
+ * On SMP we know the VMX is free, since we give it up every
+ * switch (ie, no lazy save of the vector registers).
+ *
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
+ */
+_GLOBAL(load_up_altivec)
+	mfmsr	r5			/* grab the current MSR */
+	oris	r5,r5,MSR_VEC@h
+	MTMSRD(r5)			/* enable use of AltiVec now */
+	isync
+
+	/*
+	 * While userspace in general ignores VRSAVE, glibc uses it as a boolean
+	 * to optimise userspace context save/restore. Whenever we take an
+	 * altivec unavailable exception we must set VRSAVE to something non
+	 * zero. Set it to all 1s. See also the programming note in the ISA.
+	 */
+	mfspr	r4,SPRN_VRSAVE
+	cmpwi	0,r4,0
+	bne+	1f
+	li	r4,-1
+	mtspr	SPRN_VRSAVE,r4
+1:
+	/* enable use of VMX after return */
+#ifdef CONFIG_PPC32
+	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
+	oris	r9,r9,MSR_VEC@h
+#else
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	oris	r12,r12,MSR_VEC@h
+	std	r12,_MSR(r1)
+#endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_VEC(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_VEC(r5)
+	addi	r6,r5,THREAD_VRSTATE
+	li	r4,1
+	li	r10,VRSTATE_VSCR
+	stw	r4,THREAD_USED_VR(r5)
+	lvx	v0,r10,r6
+	mtvscr	v0
+	REST_32VRS(0,r4,r6)
+	/* restore registers and return */
+	blr
+
+/*
+ * save_altivec(tsk)
+ * Save the vector registers to its thread_struct
+ */
+_GLOBAL(save_altivec)
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
+	PPC_LL	r5,PT_REGS(r3)
+	PPC_LCMPI	0,r7,0
+	bne	2f
+	addi	r7,r3,THREAD_VRSTATE
+2:	SAVE_32VRS(0,r4,r7)
+	mfvscr	v0
+	li	r4,VRSTATE_VSCR
+	stvx	v0,r4,r7
+	blr
+
+#ifdef CONFIG_VSX
+
+#ifdef CONFIG_PPC32
+#error This asm code isn't ready for 32-bit kernels
+#endif
+
+/*
+ * load_up_vsx(unused, unused, tsk)
+ * Disable VSX for the task which had it previously,
+ * and save its vector registers in its thread_struct.
+ * Reuse the fp and vsx saves, but first check to see if they have
+ * been saved already.
+ */
+_GLOBAL(load_up_vsx)
+/* Load FP and VSX registers if they haven't been done yet */
+	andi.	r5,r12,MSR_FP
+	beql+	load_up_fpu		/* skip if already loaded */
+	andis.	r5,r12,MSR_VEC@h
+	beql+	load_up_altivec		/* skip if already loaded */
+
+	ld	r4,PACACURRENT(r13)
+	addi	r4,r4,THREAD		/* Get THREAD */
+	li	r6,1
+	stw	r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */
+	/* enable use of VSX after return */
+	oris	r12,r12,MSR_VSX@h
+	std	r12,_MSR(r1)
+	b	fast_exception_return
+
+#endif /* CONFIG_VSX */
+
+
+/*
+ * The routines below are in assembler so we can closely control the
+ * usage of floating-point registers.  These routines must be called
+ * with preempt disabled.
+ */
+#ifdef CONFIG_PPC32
+	.data
+fpzero:
+	.long	0
+fpone:
+	.long	0x3f800000	/* 1.0 in single-precision FP */
+fphalf:
+	.long	0x3f000000	/* 0.5 in single-precision FP */
+
+#define LDCONST(fr, name)	\
+	lis	r11,name@ha;	\
+	lfs	fr,name@l(r11)
+#else
+
+	.section ".toc","aw"
+fpzero:
+	.tc	FD_0_0[TC],0
+fpone:
+	.tc	FD_3ff00000_0[TC],0x3ff0000000000000	/* 1.0 */
+fphalf:
+	.tc	FD_3fe00000_0[TC],0x3fe0000000000000	/* 0.5 */
+
+#define LDCONST(fr, name)	\
+	lfd	fr,name@toc(r2)
+#endif
+
+	.text
+/*
+ * Internal routine to enable floating point and set FPSCR to 0.
+ * Don't call it from C; it doesn't use the normal calling convention.
+ */
+fpenable:
+#ifdef CONFIG_PPC32
+	stwu	r1,-64(r1)
+#else
+	stdu	r1,-64(r1)
+#endif
+	mfmsr	r10
+	ori	r11,r10,MSR_FP
+	mtmsr	r11
+	isync
+	stfd	fr0,24(r1)
+	stfd	fr1,16(r1)
+	stfd	fr31,8(r1)
+	LDCONST(fr1, fpzero)
+	mffs	fr31
+	MTFSF_L(fr1)
+	blr
+
+fpdisable:
+	mtlr	r12
+	MTFSF_L(fr31)
+	lfd	fr31,8(r1)
+	lfd	fr1,16(r1)
+	lfd	fr0,24(r1)
+	mtmsr	r10
+	isync
+	addi	r1,r1,64
+	blr
+
+/*
+ * Vector add, floating point.
+ */
+_GLOBAL(vaddfp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	lfsx	fr1,r5,r6
+	fadds	fr0,fr0,fr1
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector subtract, floating point.
+ */
+_GLOBAL(vsubfp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	lfsx	fr1,r5,r6
+	fsubs	fr0,fr0,fr1
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector multiply and add, floating point.
+ */
+_GLOBAL(vmaddfp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,32(r1)
+	li	r0,4
+	mtctr	r0
+	li	r7,0
+1:	lfsx	fr0,r4,r7
+	lfsx	fr1,r5,r7
+	lfsx	fr2,r6,r7
+	fmadds	fr0,fr0,fr2,fr1
+	stfsx	fr0,r3,r7
+	addi	r7,r7,4
+	bdnz	1b
+	lfd	fr2,32(r1)
+	b	fpdisable
+
+/*
+ * Vector negative multiply and subtract, floating point.
+ */
+_GLOBAL(vnmsubfp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,32(r1)
+	li	r0,4
+	mtctr	r0
+	li	r7,0
+1:	lfsx	fr0,r4,r7
+	lfsx	fr1,r5,r7
+	lfsx	fr2,r6,r7
+	fnmsubs	fr0,fr0,fr2,fr1
+	stfsx	fr0,r3,r7
+	addi	r7,r7,4
+	bdnz	1b
+	lfd	fr2,32(r1)
+	b	fpdisable
+
+/*
+ * Vector reciprocal estimate.  We just compute 1.0/x.
+ * r3 -> destination, r4 -> source.
+ */
+_GLOBAL(vrefp)
+	mflr	r12
+	bl	fpenable
+	li	r0,4
+	LDCONST(fr1, fpone)
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	fdivs	fr0,fr1,fr0
+	stfsx	fr0,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	b	fpdisable
+
+/*
+ * Vector reciprocal square-root estimate, floating point.
+ * We use the frsqrte instruction for the initial estimate followed
+ * by 2 iterations of Newton-Raphson to get sufficient accuracy.
+ * r3 -> destination, r4 -> source.
+ */
+_GLOBAL(vrsqrtefp)
+	mflr	r12
+	bl	fpenable
+	stfd	fr2,32(r1)
+	stfd	fr3,40(r1)
+	stfd	fr4,48(r1)
+	stfd	fr5,56(r1)
+	li	r0,4
+	LDCONST(fr4, fpone)
+	LDCONST(fr5, fphalf)
+	mtctr	r0
+	li	r6,0
+1:	lfsx	fr0,r4,r6
+	frsqrte	fr1,fr0		/* r = frsqrte(s) */
+	fmuls	fr3,fr1,fr0	/* r * s */
+	fmuls	fr2,fr1,fr5	/* r * 0.5 */
+	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
+	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
+	fmuls	fr3,fr1,fr0	/* r * s */
+	fmuls	fr2,fr1,fr5	/* r * 0.5 */
+	fnmsubs	fr3,fr1,fr3,fr4	/* 1 - s * r * r */
+	fmadds	fr1,fr2,fr3,fr1	/* r = r + 0.5 * r * (1 - s * r * r) */
+	stfsx	fr1,r3,r6
+	addi	r6,r6,4
+	bdnz	1b
+	lfd	fr5,56(r1)
+	lfd	fr4,48(r1)
+	lfd	fr3,40(r1)
+	lfd	fr2,32(r1)
+	b	fpdisable
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
new file mode 100644
index 000000000..9b346f3d2
--- /dev/null
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -0,0 +1,413 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_PPC64
+#define PROVIDE32(x)	PROVIDE(__unused__##x)
+#else
+#define PROVIDE32(x)	PROVIDE(x)
+#endif
+#include <asm/page.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+
+#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32)
+#define STRICT_ALIGN_SIZE	(1 << 24)
+#else
+#define STRICT_ALIGN_SIZE	PAGE_SIZE
+#endif
+
+ENTRY(_stext)
+
+PHDRS {
+	kernel PT_LOAD FLAGS(7); /* RWX */
+	notes PT_NOTE FLAGS(0);
+	dummy PT_NOTE FLAGS(0);
+
+	/* binutils < 2.18 has a bug that makes it misbehave when taking an
+	   ELF file with all segments at load address 0 as input.  This
+	   happens when running "strip" on vmlinux, because of the AT() magic
+	   in this linker script.  People using GCC >= 4.2 won't run into
+	   this problem, because the "build-id" support will put some data
+	   into the "notes" segment (at a non-zero load address).
+
+	   To work around this, we force some data into both the "dummy"
+	   segment and the kernel segment, so the dummy segment will get a
+	   non-zero load address.  It's not enough to always create the
+	   "notes" segment, since if nothing gets assigned to it, its load
+	   address will be zero.  */
+}
+
+#ifdef CONFIG_PPC64
+OUTPUT_ARCH(powerpc:common64)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(powerpc:common)
+jiffies = jiffies_64 + 4;
+#endif
+SECTIONS
+{
+	. = KERNELBASE;
+
+/*
+ * Text, read only data and other permanent read-only sections
+ */
+
+	_text = .;
+	_stext = .;
+
+	/*
+	 * Head text.
+	 * This needs to be in its own output section to avoid ld placing
+	 * branch trampoline stubs randomly throughout the fixed sections,
+	 * which it will do (even if the branch comes from another section)
+	 * in order to optimize stub generation.
+	 */
+	.head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
+#ifdef CONFIG_PPC64
+		KEEP(*(.head.text.first_256B));
+#ifdef CONFIG_PPC_BOOK3E
+#else
+		KEEP(*(.head.text.real_vectors));
+		*(.head.text.real_trampolines);
+		KEEP(*(.head.text.virt_vectors));
+		*(.head.text.virt_trampolines);
+# if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+		KEEP(*(.head.data.fwnmi_page));
+# endif
+#endif
+#else /* !CONFIG_PPC64 */
+		HEAD_TEXT
+#endif
+	} :kernel
+
+	__head_end = .;
+
+#ifdef CONFIG_PPC64
+	/*
+	 * BLOCK(0) overrides the default output section alignment because
+	 * this needs to start right after .head.text in order for fixed
+	 * section placement to work.
+	 */
+	.text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_LD_HEAD_STUB_CATCH
+		KEEP(*(.linker_stub_catch));
+		. = . ;
+#endif
+
+#else
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		ALIGN_FUNCTION();
+#endif
+		/* careful! __ftr_alt_* sections need to be close to .text */
+		*(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text);
+		NOINSTR_TEXT
+		SCHED_TEXT
+		CPUIDLE_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		SOFTIRQENTRY_TEXT
+		/*
+		 * -Os builds call FP save/restore functions. The powerpc64
+		 * linker generates those on demand in the .sfpr section.
+		 * .sfpr gets placed at the beginning of a group of input
+		 * sections, which can break start-of-text offset if it is
+		 * included with the main text sections, so put it by itself.
+		 */
+		*(.sfpr);
+		MEM_KEEP(init.text)
+		MEM_KEEP(exit.text)
+
+#ifdef CONFIG_PPC32
+		*(.got1)
+		__got2_start = .;
+		*(.got2)
+		__got2_end = .;
+#endif /* CONFIG_PPC32 */
+
+	} :kernel
+
+	. = ALIGN(PAGE_SIZE);
+	_etext = .;
+	PROVIDE32 (etext = .);
+
+	/* Read-only data */
+	RO_DATA(PAGE_SIZE)
+
+#ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__stf_entry_barrier_fixup : AT(ADDR(__stf_entry_barrier_fixup) - LOAD_OFFSET) {
+		__start___stf_entry_barrier_fixup = .;
+		*(__stf_entry_barrier_fixup)
+		__stop___stf_entry_barrier_fixup = .;
+	}
+
+	. = ALIGN(8);
+	__uaccess_flush_fixup : AT(ADDR(__uaccess_flush_fixup) - LOAD_OFFSET) {
+		__start___uaccess_flush_fixup = .;
+		*(__uaccess_flush_fixup)
+		__stop___uaccess_flush_fixup = .;
+	}
+
+	. = ALIGN(8);
+	__entry_flush_fixup : AT(ADDR(__entry_flush_fixup) - LOAD_OFFSET) {
+		__start___entry_flush_fixup = .;
+		*(__entry_flush_fixup)
+		__stop___entry_flush_fixup = .;
+	}
+
+	. = ALIGN(8);
+	__stf_exit_barrier_fixup : AT(ADDR(__stf_exit_barrier_fixup) - LOAD_OFFSET) {
+		__start___stf_exit_barrier_fixup = .;
+		*(__stf_exit_barrier_fixup)
+		__stop___stf_exit_barrier_fixup = .;
+	}
+
+	. = ALIGN(8);
+	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+		__start___rfi_flush_fixup = .;
+		*(__rfi_flush_fixup)
+		__stop___rfi_flush_fixup = .;
+	}
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_BARRIER_NOSPEC
+	. = ALIGN(8);
+	__spec_barrier_fixup : AT(ADDR(__spec_barrier_fixup) - LOAD_OFFSET) {
+		__start___barrier_nospec_fixup = .;
+		*(__barrier_nospec_fixup)
+		__stop___barrier_nospec_fixup = .;
+	}
+#endif /* CONFIG_PPC_BARRIER_NOSPEC */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	. = ALIGN(8);
+	__spec_btb_flush_fixup : AT(ADDR(__spec_btb_flush_fixup) - LOAD_OFFSET) {
+		__start__btb_flush_fixup = .;
+		*(__btb_flush_fixup)
+		__stop__btb_flush_fixup = .;
+	}
+#endif
+	EXCEPTION_TABLE(0)
+
+	NOTES :kernel :notes
+
+	/* The dummy segment contents for the bug workaround mentioned above
+	   near PHDRS.  */
+	.dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
+		LONG(0)
+		LONG(0)
+		LONG(0)
+	} :kernel :dummy
+
+/*
+ * Init sections discarded at runtime
+ */
+	. = ALIGN(STRICT_ALIGN_SIZE);
+	__init_begin = .;
+	INIT_TEXT_SECTION(PAGE_SIZE) :kernel
+
+	/* .exit.text is discarded at runtime, not link time,
+	 * to deal with references from __bug_table
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		INIT_DATA
+		__vtop_table_begin = .;
+		KEEP(*(.vtop_fixup));
+		__vtop_table_end = .;
+		__ptov_table_begin = .;
+		KEEP(*(.ptov_fixup));
+		__ptov_table_end = .;
+	}
+
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		INIT_SETUP(16)
+	}
+
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		INIT_CALLS
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		CON_INITCALL
+	}
+
+	SECURITY_INIT
+
+	. = ALIGN(8);
+	__ftr_fixup : AT(ADDR(__ftr_fixup) - LOAD_OFFSET) {
+		__start___ftr_fixup = .;
+		KEEP(*(__ftr_fixup))
+		__stop___ftr_fixup = .;
+	}
+	. = ALIGN(8);
+	__mmu_ftr_fixup : AT(ADDR(__mmu_ftr_fixup) - LOAD_OFFSET) {
+		__start___mmu_ftr_fixup = .;
+		KEEP(*(__mmu_ftr_fixup))
+		__stop___mmu_ftr_fixup = .;
+	}
+	. = ALIGN(8);
+	__lwsync_fixup : AT(ADDR(__lwsync_fixup) - LOAD_OFFSET) {
+		__start___lwsync_fixup = .;
+		KEEP(*(__lwsync_fixup))
+		__stop___lwsync_fixup = .;
+	}
+#ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__fw_ftr_fixup : AT(ADDR(__fw_ftr_fixup) - LOAD_OFFSET) {
+		__start___fw_ftr_fixup = .;
+		KEEP(*(__fw_ftr_fixup))
+		__stop___fw_ftr_fixup = .;
+	}
+#endif
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		INIT_RAM_FS
+	}
+
+	PERCPU_SECTION(L1_CACHE_BYTES)
+
+	. = ALIGN(8);
+	.machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) {
+		__machine_desc_start = . ;
+		KEEP(*(.machine.desc))
+		__machine_desc_end = . ;
+	}
+#ifdef CONFIG_RELOCATABLE
+	. = ALIGN(8);
+	.dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET)
+	{
+#ifdef CONFIG_PPC32
+		__dynamic_symtab = .;
+#endif
+		*(.dynsym)
+	}
+	.dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) }
+	.dynamic : AT(ADDR(.dynamic) - LOAD_OFFSET)
+	{
+		__dynamic_start = .;
+		*(.dynamic)
+	}
+	.hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) }
+	.interp : AT(ADDR(.interp) - LOAD_OFFSET) { *(.interp) }
+	.rela.dyn : AT(ADDR(.rela.dyn) - LOAD_OFFSET)
+	{
+		__rela_dyn_start = .;
+		*(.rela*)
+	}
+#endif
+	/* .exit.data is discarded at runtime, not link time,
+	 * to deal with references from .exit.text
+	 */
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
+
+	/* freed after init ends here */
+	. = ALIGN(PAGE_SIZE);
+	__init_end = .;
+
+/*
+ * And now the various read/write data
+ */
+
+	. = ALIGN(PAGE_SIZE);
+	_sdata = .;
+
+#ifdef CONFIG_PPC32
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+#ifdef CONFIG_UBSAN
+		*(.data..Lubsan_data*)
+		*(.data..Lubsan_type*)
+#endif
+		*(.data.rel*)
+		*(SDATA_MAIN)
+		*(.sdata2)
+		*(.got.plt) *(.got)
+		*(.plt)
+	}
+#else
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		*(.data.rel*)
+		*(.toc1)
+		*(.branch_lt)
+	}
+
+#ifdef CONFIG_DEBUG_INFO_BTF
+	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {
+		*(.BTF)
+	}
+#endif
+
+	.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
+		__start_opd = .;
+		KEEP(*(.opd))
+		__end_opd = .;
+	}
+
+	. = ALIGN(256);
+	.got : AT(ADDR(.got) - LOAD_OFFSET) {
+		__toc_start = .;
+#ifndef CONFIG_RELOCATABLE
+		__prom_init_toc_start = .;
+		arch/powerpc/kernel/prom_init.o*(.toc .got)
+		__prom_init_toc_end = .;
+#endif
+		*(.got)
+		*(.toc)
+	}
+#endif
+
+	/* The initial task and kernel stack */
+	INIT_TASK_DATA_SECTION(THREAD_SIZE)
+
+	.data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) {
+		PAGE_ALIGNED_DATA(PAGE_SIZE)
+	}
+
+	.data..cacheline_aligned : AT(ADDR(.data..cacheline_aligned) - LOAD_OFFSET) {
+		CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
+	}
+
+	.data..read_mostly : AT(ADDR(.data..read_mostly) - LOAD_OFFSET) {
+		READ_MOSTLY_DATA(L1_CACHE_BYTES)
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		NOSAVE_DATA
+	}
+
+	BUG_TABLE
+
+	. = ALIGN(PAGE_SIZE);
+	_edata  =  .;
+	PROVIDE32 (edata = .);
+
+/*
+ * And finally the bss
+ */
+
+	BSS_SECTION(0, 0, 0)
+
+	. = ALIGN(PAGE_SIZE);
+	_end = . ;
+	PROVIDE32 (end = .);
+
+	STABS_DEBUG
+
+	DWARF_DEBUG
+
+	DISCARDS
+	/DISCARD/ : {
+		*(*.EMB.apuinfo)
+		*(.glink .iplt .plt .rela* .comment)
+		*(.gnu.version*)
+		*(.gnu.attributes)
+		*(.eh_frame)
+	}
+}
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
new file mode 100644
index 000000000..75b2a6c4d
--- /dev/null
+++ b/arch/powerpc/kernel/watchdog.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Watchdog support on powerpc systems.
+ *
+ * Copyright 2017, IBM Corporation.
+ *
+ * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
+ */
+
+#define pr_fmt(fmt) "watchdog: " fmt
+
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+
+#include <asm/paca.h>
+
+/*
+ * The powerpc watchdog ensures that each CPU is able to service timers.
+ * The watchdog sets up a simple timer on each CPU to run once per timer
+ * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
+ * the heartbeat.
+ *
+ * Then there are two systems to check that the heartbeat is still running.
+ * The local soft-NMI, and the SMP checker.
+ *
+ * The soft-NMI checker can detect lockups on the local CPU. When interrupts
+ * are disabled with local_irq_disable(), platforms that use soft-masking
+ * can leave hardware interrupts enabled and handle them with a masked
+ * interrupt handler. The masked handler can send the timer interrupt to the
+ * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
+ * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
+ *
+ * The soft-NMI checker will compare the heartbeat timestamp for this CPU
+ * with the current time, and take action if the difference exceeds the
+ * watchdog threshold.
+ *
+ * The limitation of the soft-NMI watchdog is that it does not work when
+ * interrupts are hard disabled or otherwise not being serviced. This is
+ * solved by also having a SMP watchdog where all CPUs check all other
+ * CPUs heartbeat.
+ *
+ * The SMP checker can detect lockups on other CPUs. A gobal "pending"
+ * cpumask is kept, containing all CPUs which enable the watchdog. Each
+ * CPU clears their pending bit in their heartbeat timer. When the bitmask
+ * becomes empty, the last CPU to clear its pending bit updates a global
+ * timestamp and refills the pending bitmask.
+ *
+ * In the heartbeat timer, if any CPU notices that the global timestamp has
+ * not been updated for a period exceeding the watchdog threshold, then it
+ * means the CPU(s) with their bit still set in the pending mask have had
+ * their heartbeat stop, and action is taken.
+ *
+ * Some platforms implement true NMI IPIs, which can be used by the SMP
+ * watchdog to detect an unresponsive CPU and pull it out of its stuck
+ * state with the NMI IPI, to get crash/debug data from it. This way the
+ * SMP watchdog can detect hardware interrupts off lockups.
+ */
+
+static cpumask_t wd_cpus_enabled __read_mostly;
+
+static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
+static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */
+
+static u64 wd_timer_period_ms __read_mostly;  /* interval between heartbeat */
+
+static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer);
+static DEFINE_PER_CPU(u64, wd_timer_tb);
+
+/* SMP checker bits */
+static unsigned long __wd_smp_lock;
+static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck;
+static u64 wd_smp_last_reset_tb;
+
+static inline void wd_smp_lock(unsigned long *flags)
+{
+	/*
+	 * Avoid locking layers if possible.
+	 * This may be called from low level interrupt handlers at some
+	 * point in future.
+	 */
+	raw_local_irq_save(*flags);
+	hard_irq_disable(); /* Make it soft-NMI safe */
+	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
+		raw_local_irq_restore(*flags);
+		spin_until_cond(!test_bit(0, &__wd_smp_lock));
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
+}
+
+static inline void wd_smp_unlock(unsigned long *flags)
+{
+	clear_bit_unlock(0, &__wd_smp_lock);
+	raw_local_irq_restore(*flags);
+}
+
+static void wd_lockup_ipi(struct pt_regs *regs)
+{
+	int cpu = raw_smp_processor_id();
+	u64 tb = get_tb();
+
+	pr_emerg("CPU %d Hard LOCKUP\n", cpu);
+	pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
+		 cpu, tb, per_cpu(wd_timer_tb, cpu),
+		 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
+	print_modules();
+	print_irqtrace_events(current);
+	if (regs)
+		show_regs(regs);
+	else
+		dump_stack();
+
+	/* Do not panic from here because that can recurse into NMI IPI layer */
+}
+
+static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
+{
+	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
+	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+	/*
+	 * See wd_smp_clear_cpu_pending()
+	 */
+	smp_mb();
+	if (cpumask_empty(&wd_smp_cpus_pending)) {
+		wd_smp_last_reset_tb = tb;
+		cpumask_andnot(&wd_smp_cpus_pending,
+				&wd_cpus_enabled,
+				&wd_smp_cpus_stuck);
+	}
+}
+static void set_cpu_stuck(int cpu, u64 tb)
+{
+	set_cpumask_stuck(cpumask_of(cpu), tb);
+}
+
+static void watchdog_smp_panic(int cpu, u64 tb)
+{
+	unsigned long flags;
+	int c;
+
+	wd_smp_lock(&flags);
+	/* Double check some things under lock */
+	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
+		goto out;
+	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
+		goto out;
+	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
+		goto out;
+
+	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
+		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
+	pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
+		 cpu, tb, wd_smp_last_reset_tb,
+		 tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);
+
+	if (!sysctl_hardlockup_all_cpu_backtrace) {
+		/*
+		 * Try to trigger the stuck CPUs, unless we are going to
+		 * get a backtrace on all of them anyway.
+		 */
+		for_each_cpu(c, &wd_smp_cpus_pending) {
+			if (c == cpu)
+				continue;
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		}
+	}
+
+	/* Take the stuck CPUs out of the watch group */
+	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
+
+	wd_smp_unlock(&flags);
+
+	printk_safe_flush();
+	/*
+	 * printk_safe_flush() seems to require another print
+	 * before anything actually goes out to console.
+	 */
+	if (sysctl_hardlockup_all_cpu_backtrace)
+		trigger_allbutself_cpu_backtrace();
+
+	if (hardlockup_panic)
+		nmi_panic(NULL, "Hard LOCKUP");
+
+	return;
+
+out:
+	wd_smp_unlock(&flags);
+}
+
+static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
+{
+	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
+		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
+			struct pt_regs *regs = get_irq_regs();
+			unsigned long flags;
+
+			wd_smp_lock(&flags);
+
+			pr_emerg("CPU %d became unstuck TB:%lld\n",
+				 cpu, tb);
+			print_irqtrace_events(current);
+			if (regs)
+				show_regs(regs);
+			else
+				dump_stack();
+
+			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
+			wd_smp_unlock(&flags);
+		} else {
+			/*
+			 * The last CPU to clear pending should have reset the
+			 * watchdog so we generally should not find it empty
+			 * here if our CPU was clear. However it could happen
+			 * due to a rare race with another CPU taking the
+			 * last CPU out of the mask concurrently.
+			 *
+			 * We can't add a warning for it. But just in case
+			 * there is a problem with the watchdog that is causing
+			 * the mask to not be reset, try to kick it along here.
+			 */
+			if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
+				goto none_pending;
+		}
+		return;
+	}
+
+	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+
+	/*
+	 * Order the store to clear pending with the load(s) to check all
+	 * words in the pending mask to check they are all empty. This orders
+	 * with the same barrier on another CPU. This prevents two CPUs
+	 * clearing the last 2 pending bits, but neither seeing the other's
+	 * store when checking if the mask is empty, and missing an empty
+	 * mask, which ends with a false positive.
+	 */
+	smp_mb();
+	if (cpumask_empty(&wd_smp_cpus_pending)) {
+		unsigned long flags;
+
+none_pending:
+		/*
+		 * Double check under lock because more than one CPU could see
+		 * a clear mask with the lockless check after clearing their
+		 * pending bits.
+		 */
+		wd_smp_lock(&flags);
+		if (cpumask_empty(&wd_smp_cpus_pending)) {
+			wd_smp_last_reset_tb = tb;
+			cpumask_andnot(&wd_smp_cpus_pending,
+					&wd_cpus_enabled,
+					&wd_smp_cpus_stuck);
+		}
+		wd_smp_unlock(&flags);
+	}
+}
+
+static void watchdog_timer_interrupt(int cpu)
+{
+	u64 tb = get_tb();
+
+	per_cpu(wd_timer_tb, cpu) = tb;
+
+	wd_smp_clear_cpu_pending(cpu, tb);
+
+	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
+		watchdog_smp_panic(cpu, tb);
+}
+
+void soft_nmi_interrupt(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int cpu = raw_smp_processor_id();
+	u64 tb;
+
+	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+		return;
+
+	nmi_enter();
+
+	__this_cpu_inc(irq_stat.soft_nmi_irqs);
+
+	tb = get_tb();
+	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
+		wd_smp_lock(&flags);
+		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
+			wd_smp_unlock(&flags);
+			goto out;
+		}
+		set_cpu_stuck(cpu, tb);
+
+		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
+			 cpu, (void *)regs->nip);
+		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
+			 cpu, tb, per_cpu(wd_timer_tb, cpu),
+			 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
+		print_modules();
+		print_irqtrace_events(current);
+		show_regs(regs);
+
+		wd_smp_unlock(&flags);
+
+		if (sysctl_hardlockup_all_cpu_backtrace)
+			trigger_allbutself_cpu_backtrace();
+
+		if (hardlockup_panic)
+			nmi_panic(regs, "Hard LOCKUP");
+	}
+	if (wd_panic_timeout_tb < 0x7fffffff)
+		mtspr(SPRN_DEC, wd_panic_timeout_tb);
+
+out:
+	nmi_exit();
+}
+
+static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
+{
+	int cpu = smp_processor_id();
+
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		return HRTIMER_NORESTART;
+
+	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+		return HRTIMER_NORESTART;
+
+	watchdog_timer_interrupt(cpu);
+
+	hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms));
+
+	return HRTIMER_RESTART;
+}
+
+void arch_touch_nmi_watchdog(void)
+{
+	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
+	int cpu = smp_processor_id();
+	u64 tb;
+
+	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+		return;
+
+	tb = get_tb();
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
+}
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+static void start_watchdog(void *arg)
+{
+	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
+	int cpu = smp_processor_id();
+	unsigned long flags;
+
+	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
+		WARN_ON(1);
+		return;
+	}
+
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		return;
+
+	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+		return;
+
+	wd_smp_lock(&flags);
+	cpumask_set_cpu(cpu, &wd_cpus_enabled);
+	if (cpumask_weight(&wd_cpus_enabled) == 1) {
+		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
+		wd_smp_last_reset_tb = get_tb();
+	}
+	wd_smp_unlock(&flags);
+
+	*this_cpu_ptr(&wd_timer_tb) = get_tb();
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = watchdog_timer_fn;
+	hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static int start_watchdog_on_cpu(unsigned int cpu)
+{
+	return smp_call_function_single(cpu, start_watchdog, NULL, true);
+}
+
+static void stop_watchdog(void *arg)
+{
+	struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer);
+	int cpu = smp_processor_id();
+	unsigned long flags;
+
+	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+		return; /* Can happen in CPU unplug case */
+
+	hrtimer_cancel(hrtimer);
+
+	wd_smp_lock(&flags);
+	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+	wd_smp_unlock(&flags);
+
+	wd_smp_clear_cpu_pending(cpu, get_tb());
+}
+
+static int stop_watchdog_on_cpu(unsigned int cpu)
+{
+	return smp_call_function_single(cpu, stop_watchdog, NULL, true);
+}
+
+static void watchdog_calc_timeouts(void)
+{
+	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
+
+	/* Have the SMP detector trigger a bit later */
+	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
+
+	/* 2/5 is the factor that the perf based detector uses */
+	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
+}
+
+void watchdog_nmi_stop(void)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &wd_cpus_enabled)
+		stop_watchdog_on_cpu(cpu);
+}
+
+void watchdog_nmi_start(void)
+{
+	int cpu;
+
+	watchdog_calc_timeouts();
+	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+		start_watchdog_on_cpu(cpu);
+}
+
+/*
+ * Invoked from core watchdog init.
+ */
+int __init watchdog_nmi_probe(void)
+{
+	int err;
+
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"powerpc/watchdog:online",
+					start_watchdog_on_cpu,
+					stop_watchdog_on_cpu);
+	if (err < 0) {
+		pr_warn("could not be initialized");
+		return err;
+	}
+	return 0;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
commit	76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree	f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/powerpc/kernel
parent	Initial commit. (diff)
download	linux-c109f8d9e922037b3fa45f46d78384d49db8ad76.tar.xz linux-c109f8d9e922037b3fa45f46d78384d49db8ad76.zip