From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Thu, 11 Apr 2024 10:27:49 +0200
Subject: Adding upstream version 6.6.15.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 arch/powerpc/kvm/Kconfig                 |  253 ++
 arch/powerpc/kvm/Makefile                |  139 +
 arch/powerpc/kvm/book3s.c                | 1113 ++++++
 arch/powerpc/kvm/book3s.h                |   38 +
 arch/powerpc/kvm/book3s_32_mmu.c         |  415 ++
 arch/powerpc/kvm/book3s_32_mmu_host.c    |  395 ++
 arch/powerpc/kvm/book3s_32_sr.S          |  148 +
 arch/powerpc/kvm/book3s_64_entry.S       |  429 ++
 arch/powerpc/kvm/book3s_64_mmu.c         |  670 ++++
 arch/powerpc/kvm/book3s_64_mmu_host.c    |  407 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c      | 2150 ++++++++++
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 1492 +++++++
 arch/powerpc/kvm/book3s_64_slb.S         |  145 +
 arch/powerpc/kvm/book3s_64_vio.c         |  798 ++++
 arch/powerpc/kvm/book3s_emulate.c        | 1072 +++++
 arch/powerpc/kvm/book3s_exports.c        |   19 +
 arch/powerpc/kvm/book3s_hv.c             | 6360 ++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.h             |  120 +
 arch/powerpc/kvm/book3s_hv_builtin.c     |  625 +++
 arch/powerpc/kvm/book3s_hv_hmi.c         |   50 +
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  158 +
 arch/powerpc/kvm/book3s_hv_nested.c      | 1678 ++++++++
 arch/powerpc/kvm/book3s_hv_p9_entry.c    |  930 +++++
 arch/powerpc/kvm/book3s_hv_p9_perf.c     |  219 +
 arch/powerpc/kvm/book3s_hv_ras.c         |  377 ++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      | 1300 ++++++
 arch/powerpc/kvm/book3s_hv_rm_xics.c     |  924 +++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 3026 ++++++++++++++
 arch/powerpc/kvm/book3s_hv_tm.c          |  248 ++
 arch/powerpc/kvm/book3s_hv_tm_builtin.c  |  119 +
 arch/powerpc/kvm/book3s_hv_uvmem.c       | 1223 ++++++
 arch/powerpc/kvm/book3s_interrupts.S     |  239 ++
 arch/powerpc/kvm/book3s_mmu_hpte.c       |  386 ++
 arch/powerpc/kvm/book3s_paired_singles.c | 1263 ++++++
 arch/powerpc/kvm/book3s_pr.c             | 2123 ++++++++++
 arch/powerpc/kvm/book3s_pr_papr.c        |  496 +++
 arch/powerpc/kvm/book3s_rmhandlers.S     |  163 +
 arch/powerpc/kvm/book3s_rtas.c           |  307 ++
 arch/powerpc/kvm/book3s_segment.S        |  412 ++
 arch/powerpc/kvm/book3s_xics.c           | 1507 +++++++
 arch/powerpc/kvm/book3s_xics.h           |  153 +
 arch/powerpc/kvm/book3s_xive.c           | 2982 ++++++++++++++
 arch/powerpc/kvm/book3s_xive.h           |  313 ++
 arch/powerpc/kvm/book3s_xive_native.c    | 1284 ++++++
 arch/powerpc/kvm/booke.c                 | 2242 +++++++++++
 arch/powerpc/kvm/booke.h                 |  115 +
 arch/powerpc/kvm/booke_emulate.c         |  511 +++
 arch/powerpc/kvm/booke_interrupts.S      |  535 +++
 arch/powerpc/kvm/bookehv_interrupts.S    |  673 ++++
 arch/powerpc/kvm/e500.c                  |  553 +++
 arch/powerpc/kvm/e500.h                  |  339 ++
 arch/powerpc/kvm/e500_emulate.c          |  452 +++
 arch/powerpc/kvm/e500_mmu.c              |  956 +++++
 arch/powerpc/kvm/e500_mmu_host.c         |  803 ++++
 arch/powerpc/kvm/e500_mmu_host.h         |   15 +
 arch/powerpc/kvm/e500mc.c                |  431 ++
 arch/powerpc/kvm/emulate.c               |  313 ++
 arch/powerpc/kvm/emulate_loadstore.c     |  366 ++
 arch/powerpc/kvm/fpu.S                   |  285 ++
 arch/powerpc/kvm/mpic.c                  | 1852 +++++++++
 arch/powerpc/kvm/powerpc.c               | 2552 ++++++++++++
 arch/powerpc/kvm/timing.c                |  213 +
 arch/powerpc/kvm/timing.h                |   99 +
 arch/powerpc/kvm/tm.S                    |  398 ++
 arch/powerpc/kvm/trace.h                 |  127 +
 arch/powerpc/kvm/trace_book3s.h          |   32 +
 arch/powerpc/kvm/trace_booke.h           |  211 +
 arch/powerpc/kvm/trace_hv.h              |  525 +++
 arch/powerpc/kvm/trace_pr.h              |  265 ++
 69 files changed, 53531 insertions(+)
 create mode 100644 arch/powerpc/kvm/Kconfig
 create mode 100644 arch/powerpc/kvm/Makefile
 create mode 100644 arch/powerpc/kvm/book3s.c
 create mode 100644 arch/powerpc/kvm/book3s.h
 create mode 100644 arch/powerpc/kvm/book3s_32_mmu.c
 create mode 100644 arch/powerpc/kvm/book3s_32_mmu_host.c
 create mode 100644 arch/powerpc/kvm/book3s_32_sr.S
 create mode 100644 arch/powerpc/kvm/book3s_64_entry.S
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu.c
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_host.c
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_radix.c
 create mode 100644 arch/powerpc/kvm/book3s_64_slb.S
 create mode 100644 arch/powerpc/kvm/book3s_64_vio.c
 create mode 100644 arch/powerpc/kvm/book3s_emulate.c
 create mode 100644 arch/powerpc/kvm/book3s_exports.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.h
 create mode 100644 arch/powerpc/kvm/book3s_hv_builtin.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_hmi.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_nested.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_p9_entry.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_p9_perf.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_ras.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_rm_mmu.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_tm.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_tm_builtin.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_uvmem.c
 create mode 100644 arch/powerpc/kvm/book3s_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_mmu_hpte.c
 create mode 100644 arch/powerpc/kvm/book3s_paired_singles.c
 create mode 100644 arch/powerpc/kvm/book3s_pr.c
 create mode 100644 arch/powerpc/kvm/book3s_pr_papr.c
 create mode 100644 arch/powerpc/kvm/book3s_rmhandlers.S
 create mode 100644 arch/powerpc/kvm/book3s_rtas.c
 create mode 100644 arch/powerpc/kvm/book3s_segment.S
 create mode 100644 arch/powerpc/kvm/book3s_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_xics.h
 create mode 100644 arch/powerpc/kvm/book3s_xive.c
 create mode 100644 arch/powerpc/kvm/book3s_xive.h
 create mode 100644 arch/powerpc/kvm/book3s_xive_native.c
 create mode 100644 arch/powerpc/kvm/booke.c
 create mode 100644 arch/powerpc/kvm/booke.h
 create mode 100644 arch/powerpc/kvm/booke_emulate.c
 create mode 100644 arch/powerpc/kvm/booke_interrupts.S
 create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S
 create mode 100644 arch/powerpc/kvm/e500.c
 create mode 100644 arch/powerpc/kvm/e500.h
 create mode 100644 arch/powerpc/kvm/e500_emulate.c
 create mode 100644 arch/powerpc/kvm/e500_mmu.c
 create mode 100644 arch/powerpc/kvm/e500_mmu_host.c
 create mode 100644 arch/powerpc/kvm/e500_mmu_host.h
 create mode 100644 arch/powerpc/kvm/e500mc.c
 create mode 100644 arch/powerpc/kvm/emulate.c
 create mode 100644 arch/powerpc/kvm/emulate_loadstore.c
 create mode 100644 arch/powerpc/kvm/fpu.S
 create mode 100644 arch/powerpc/kvm/mpic.c
 create mode 100644 arch/powerpc/kvm/powerpc.c
 create mode 100644 arch/powerpc/kvm/timing.c
 create mode 100644 arch/powerpc/kvm/timing.h
 create mode 100644 arch/powerpc/kvm/tm.S
 create mode 100644 arch/powerpc/kvm/trace.h
 create mode 100644 arch/powerpc/kvm/trace_book3s.h
 create mode 100644 arch/powerpc/kvm/trace_booke.h
 create mode 100644 arch/powerpc/kvm/trace_hv.h
 create mode 100644 arch/powerpc/kvm/trace_pr.h

(limited to 'arch/powerpc/kvm')

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
new file mode 100644
index 0000000000..9026119542
--- /dev/null
+++ b/arch/powerpc/kvm/Kconfig
@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+	bool "Virtualization"
+	help
+	  Say Y here to get to see options for using your Linux host to run
+	  other operating systems inside virtual machines (guests).
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled.
+
+if VIRTUALIZATION
+
+config KVM
+	bool
+	select PREEMPT_NOTIFIERS
+	select HAVE_KVM_EVENTFD
+	select HAVE_KVM_VCPU_ASYNC_IOCTL
+	select KVM_VFIO
+	select IRQ_BYPASS_MANAGER
+	select HAVE_KVM_IRQ_BYPASS
+	select INTERVAL_TREE
+
+config KVM_BOOK3S_HANDLER
+	bool
+
+config KVM_BOOK3S_32_HANDLER
+	bool
+	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
+
+config KVM_BOOK3S_64_HANDLER
+	bool
+	select KVM_BOOK3S_HANDLER
+
+config KVM_BOOK3S_PR_POSSIBLE
+	bool
+	select KVM_MMIO
+	select MMU_NOTIFIER
+
+config KVM_BOOK3S_HV_POSSIBLE
+	bool
+
+config KVM_BOOK3S_32
+	tristate "KVM support for PowerPC book3s_32 processors"
+	depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT
+	depends on !CONTEXT_TRACKING_USER
+	select KVM
+	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_PR_POSSIBLE
+	select PPC_FPU
+	help
+	  Support running unmodified book3s_32 guest kernels
+	  in virtual machines on book3s_32 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64
+	tristate "KVM support for PowerPC book3s_64 processors"
+	depends on PPC_BOOK3S_64
+	select KVM_BOOK3S_64_HANDLER
+	select KVM
+	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
+	select PPC_64S_HASH_MMU
+	select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_PSERIES || PPC_POWERNV)
+	help
+	  Support running unmodified book3s_64 and book3s_32 guest kernels
+	  in virtual machines on book3s_64 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_HV
+	tristate "KVM for POWER7 and later using hypervisor mode in host"
+	depends on KVM_BOOK3S_64 && PPC_POWERNV
+	select KVM_BOOK3S_HV_POSSIBLE
+	select MMU_NOTIFIER
+	select CMA
+	help
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 and newer processors that have
+	  hypervisor mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 or later processors, and cannot emulate a
+	  different processor from the host processor.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+	tristate "KVM support without using hypervisor mode in host"
+	depends on KVM_BOOK3S_64
+	depends on !CONTEXT_TRACKING_USER
+	select KVM_BOOK3S_PR_POSSIBLE
+	help
+	  Support running guest kernels in virtual machines on processors
+	  without using hypervisor mode in the host, by running the
+	  guest in user mode (problem state) and emulating all
+	  privileged instructions and registers.
+
+	  This is only available for hash MMU mode and only supports
+	  guests that use hash MMU mode.
+
+	  This is not as fast as using hypervisor mode, but works on
+	  machines where hypervisor mode is not available or not usable,
+	  and can emulate processors that are different from the host
+	  processor, including emulating 32-bit processors on a 64-bit
+	  host.
+
+	  Selecting this option will cause the SCV facility to be
+	  disabled when the kernel is booted on the pseries platform in
+	  hash MMU mode (regardless of PR VMs running). When any PR VMs
+	  are running, "AIL" mode is disabled which may slow interrupts
+	  and system calls on the host.
+
+config KVM_BOOK3S_HV_EXIT_TIMING
+	bool
+
+config KVM_BOOK3S_HV_P9_TIMING
+	bool "Detailed timing for the P9 entry point"
+	select KVM_BOOK3S_HV_EXIT_TIMING
+	depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+	help
+	  Calculate time taken for each vcpu during vcpu entry and
+	  exit, time spent inside the guest and time spent handling
+	  hypercalls and page faults. The total, minimum and maximum
+	  times in nanoseconds together with the number of executions
+	  are reported in debugfs in kvm/vm#/vcpu#/timings.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_HV_P8_TIMING
+	bool "Detailed timing for hypervisor real-mode code (for POWER8)"
+	select KVM_BOOK3S_HV_EXIT_TIMING
+	depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS && !KVM_BOOK3S_HV_P9_TIMING
+	help
+	  Calculate time taken for each vcpu in the real-mode guest entry,
+	  exit, and interrupt handling code, plus time spent in the guest
+	  and in nap mode due to idle (cede) while other threads are still
+	  in the guest.  The total, minimum and maximum times in nanoseconds
+	  together with the number of executions are reported in debugfs in
+	  kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+	  ns per exit on POWER8.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND
+	bool "Nested L0 host workaround for L1 KVM host PMU handling bug" if EXPERT
+	depends on KVM_BOOK3S_HV_POSSIBLE
+	default !EXPERT
+	help
+	  Old nested HV capable Linux guests have a bug where they don't
+	  reflect the PMU in-use status of their L2 guest to the L0 host
+	  while the L2 PMU registers are live. This can result in loss
+	  of L2 PMU register state, causing perf to not work correctly in
+	  L2 guests.
+
+	  Selecting this option for the L0 host implements a workaround for
+	  those buggy L1s which saves the L2 state, at the cost of performance
+	  in all nested-capable guest entry/exit.
+
+config KVM_BOOKE_HV
+	bool
+
+config KVM_EXIT_TIMING
+	bool "Detailed exit timing"
+	depends on KVM_E500V2 || KVM_E500MC
+	help
+	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
+	  report is available in debugfs kvm/vm#_vcpu#_timing.
+	  The overhead is relatively small, however it is not recommended for
+	  production environments.
+
+	  If unsure, say N.
+
+config KVM_E500V2
+	bool "KVM support for PowerPC E500v2 processors"
+	depends on PPC_E500 && !PPC_E500MC
+	depends on !CONTEXT_TRACKING_USER
+	select KVM
+	select KVM_MMIO
+	select MMU_NOTIFIER
+	help
+	  Support running unmodified E500 guest kernels in virtual machines on
+	  E500v2 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_E500MC
+	bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
+	depends on PPC_E500MC
+	depends on !CONTEXT_TRACKING_USER
+	select KVM
+	select KVM_MMIO
+	select KVM_BOOKE_HV
+	select MMU_NOTIFIER
+	help
+	  Support running unmodified E500MC/E5500/E6500 guest kernels in
+	  virtual machines on E500MC/E5500/E6500 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_MPIC
+	bool "KVM in-kernel MPIC emulation"
+	depends on KVM && PPC_E500
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQFD
+	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_MSI
+	help
+	  Enable support for emulating MPIC devices inside the
+	  host kernel, rather than relying on userspace to emulate.
+	  Currently, support is limited to certain versions of
+	  Freescale's MPIC implementation.
+
+config KVM_XICS
+	bool "KVM in-kernel XICS emulation"
+	depends on KVM_BOOK3S_64 && !KVM_MPIC
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQFD
+	default y
+	help
+	  Include support for the XICS (eXternal Interrupt Controller
+	  Specification) interrupt controller architecture used on
+	  IBM POWER (pSeries) servers.
+
+config KVM_XIVE
+	bool
+	default y
+	depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE
+
+endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
new file mode 100644
index 0000000000..5319d889b1
--- /dev/null
+++ b/arch/powerpc/kvm/Makefile
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
+
+include $(srctree)/virt/kvm/Makefile.kvm
+
+common-objs-y += powerpc.o emulate_loadstore.o
+obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
+obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
+
+AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
+
+kvm-e500-objs := \
+	$(common-objs-y) \
+	emulate.o \
+	booke.o \
+	booke_emulate.o \
+	booke_interrupts.o \
+	e500.o \
+	e500_mmu.o \
+	e500_mmu_host.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
+
+kvm-e500mc-objs := \
+	$(common-objs-y) \
+	emulate.o \
+	booke.o \
+	booke_emulate.o \
+	bookehv_interrupts.o \
+	e500mc.o \
+	e500_mmu.o \
+	e500_mmu_host.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
+
+kvm-pr-y := \
+	fpu.o \
+	emulate.o \
+	book3s_paired_singles.o \
+	book3s_pr.o \
+	book3s_pr_papr.o \
+	book3s_emulate.o \
+	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
+	book3s_64_mmu_host.o \
+	book3s_64_mmu.o \
+	book3s_32_mmu.o
+
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_64_entry.o \
+	tm.o
+
+ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_rmhandlers.o
+endif
+
+kvm-hv-y += \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o \
+	book3s_64_mmu_radix.o \
+	book3s_hv_nested.o
+
+kvm-hv-$(CONFIG_PPC_UV) += \
+	book3s_hv_uvmem.o
+
+kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
+	book3s_hv_tm.o
+
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+	book3s_hv_rm_xics.o
+
+kvm-book3s_64-builtin-tm-objs-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
+	book3s_hv_tm_builtin.o
+
+ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_hv_hmi.o \
+	book3s_hv_p9_entry.o \
+	book3s_hv_rmhandlers.o \
+	book3s_hv_rm_mmu.o \
+	book3s_hv_ras.o \
+	book3s_hv_builtin.o \
+	book3s_hv_p9_perf.o \
+	$(kvm-book3s_64-builtin-tm-objs-y) \
+	$(kvm-book3s_64-builtin-xics-objs-y)
+endif
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+	book3s_xics.o
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o book3s_xive_native.o
+kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o
+
+kvm-book3s_64-module-objs := \
+	$(common-objs-y) \
+	book3s.o \
+	book3s_rtas.o \
+	$(kvm-book3s_64-objs-y)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
+
+kvm-book3s_32-objs := \
+	$(common-objs-y) \
+	emulate.o \
+	fpu.o \
+	book3s_paired_singles.o \
+	book3s.o \
+	book3s_pr.o \
+	book3s_emulate.o \
+	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
+	book3s_32_mmu_host.o \
+	book3s_32_mmu.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+
+kvm-y += $(kvm-objs-m) $(kvm-objs-y)
+
+obj-$(CONFIG_KVM_E500V2) += kvm.o
+obj-$(CONFIG_KVM_E500MC) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
+
+obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o
+obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o
+
+obj-y += $(kvm-book3s_64-builtin-objs-y)
+
+# KVM does a lot in real-mode, and 64-bit Book3S KASAN doesn't support that
+ifdef CONFIG_PPC_BOOK3S_64
+KASAN_SANITIZE := n
+endif
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
new file mode 100644
index 0000000000..686d8d9eda
--- /dev/null
+++ b/arch/powerpc/kvm/book3s.c
@@ -0,0 +1,1113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/xive.h>
+
+#include "book3s.h"
+#include "trace.h"
+
+/* #define EXIT_DEBUG */
+
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_ICOUNTER(VM, num_2M_pages),
+	STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, sum_exits),
+	STATS_DESC_COUNTER(VCPU, mmio_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, light_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, syscall_exits),
+	STATS_DESC_COUNTER(VCPU, isi_exits),
+	STATS_DESC_COUNTER(VCPU, dsi_exits),
+	STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+	STATS_DESC_COUNTER(VCPU, dec_exits),
+	STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+	STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+	STATS_DESC_COUNTER(VCPU, dbell_exits),
+	STATS_DESC_COUNTER(VCPU, gdbell_exits),
+	STATS_DESC_COUNTER(VCPU, ld),
+	STATS_DESC_COUNTER(VCPU, st),
+	STATS_DESC_COUNTER(VCPU, pf_storage),
+	STATS_DESC_COUNTER(VCPU, pf_instruc),
+	STATS_DESC_COUNTER(VCPU, sp_storage),
+	STATS_DESC_COUNTER(VCPU, sp_instruc),
+	STATS_DESC_COUNTER(VCPU, queue_intr),
+	STATS_DESC_COUNTER(VCPU, ld_slow),
+	STATS_DESC_COUNTER(VCPU, st_slow),
+	STATS_DESC_COUNTER(VCPU, pthru_all),
+	STATS_DESC_COUNTER(VCPU, pthru_host),
+	STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	if (is_kvmppc_hv_enabled(vcpu->kvm))
+		return;
+	if (pending_now)
+		kvmppc_set_int_pending(vcpu, 1);
+	else if (old_pending)
+		kvmppc_set_int_pending(vcpu, 0);
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	ulong crit_raw;
+	ulong crit_r1;
+	bool crit;
+
+	if (is_kvmppc_hv_enabled(vcpu->kvm))
+		return false;
+
+	crit_raw = kvmppc_get_critical(vcpu);
+	crit_r1 = kvmppc_get_gpr(vcpu, 1);
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR);
+
+	return crit;
+}
+
+void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
+{
+	vcpu->kvm->arch.kvm_ops->inject_interrupt(vcpu, vec, flags);
+}
+
+static int kvmppc_book3s_vec2irqprio(unsigned int vec)
+{
+	unsigned int prio;
+
+	switch (vec) {
+	case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET;		break;
+	case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK;	break;
+	case 0x300: prio = BOOK3S_IRQPRIO_DATA_STORAGE;		break;
+	case 0x380: prio = BOOK3S_IRQPRIO_DATA_SEGMENT;		break;
+	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
+	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
+	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
+	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
+	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
+	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
+	case 0x900: prio = BOOK3S_IRQPRIO_DECREMENTER;		break;
+	case 0xc00: prio = BOOK3S_IRQPRIO_SYSCALL;		break;
+	case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;		break;
+	case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;		break;
+	case 0xf40: prio = BOOK3S_IRQPRIO_VSX;			break;
+	case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;		break;
+	default:    prio = BOOK3S_IRQPRIO_MAX;			break;
+	}
+
+	return prio;
+}
+
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+					  unsigned int vec)
+{
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+
+	clear_bit(kvmppc_book3s_vec2irqprio(vec),
+		  &vcpu->arch.pending_exceptions);
+
+	kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+				  old_pending);
+}
+
+void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+{
+	vcpu->stat.queue_intr++;
+
+	set_bit(kvmppc_book3s_vec2irqprio(vec),
+		&vcpu->arch.pending_exceptions);
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "Queueing interrupt %x\n", vec);
+#endif
+}
+EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
+
+void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, srr1_flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
+
+void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu)
+{
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
+}
+EXPORT_SYMBOL(kvmppc_core_queue_syscall);
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, srr1_flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_program);
+
+void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, srr1_flags);
+}
+
+void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, srr1_flags);
+}
+
+void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, srr1_flags);
+}
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec);
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+	return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec);
+
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+	/*
+	 * This case (KVM_INTERRUPT_SET) should never actually arise for
+	 * a pseries guest (because pseries guests expect their interrupt
+	 * controllers to continue asserting an external interrupt request
+	 * until it is acknowledged at the interrupt controller), but is
+	 * included to avoid ABI breakage and potentially for other
+	 * sorts of guest.
+	 *
+	 * There is a subtlety here: HV KVM does not test the
+	 * external_oneshot flag in the code that synthesizes
+	 * external interrupts for the guest just before entering
+	 * the guest.  That is OK even if userspace did do a
+	 * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
+	 * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
+	 * which ends up doing a smp_send_reschedule(), which will
+	 * pull the guest all the way out to the host, meaning that
+	 * we will call kvmppc_core_prepare_to_enter() before entering
+	 * the guest again, and that will handle the external_oneshot
+	 * flag correctly.
+	 */
+	if (irq->irq == KVM_INTERRUPT_SET)
+		vcpu->arch.external_oneshot = 1;
+
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+}
+
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+}
+
+void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong srr1_flags,
+				    ulong dar, ulong dsisr)
+{
+	kvmppc_set_dar(vcpu, dar);
+	kvmppc_set_dsisr(vcpu, dsisr);
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, srr1_flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage);
+
+void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, srr1_flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_inst_storage);
+
+static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
+					 unsigned int priority)
+{
+	int deliver = 1;
+	int vec = 0;
+	bool crit = kvmppc_critical_section(vcpu);
+
+	switch (priority) {
+	case BOOK3S_IRQPRIO_DECREMENTER:
+		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
+		vec = BOOK3S_INTERRUPT_DECREMENTER;
+		break;
+	case BOOK3S_IRQPRIO_EXTERNAL:
+		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
+		vec = BOOK3S_INTERRUPT_EXTERNAL;
+		break;
+	case BOOK3S_IRQPRIO_SYSTEM_RESET:
+		vec = BOOK3S_INTERRUPT_SYSTEM_RESET;
+		break;
+	case BOOK3S_IRQPRIO_MACHINE_CHECK:
+		vec = BOOK3S_INTERRUPT_MACHINE_CHECK;
+		break;
+	case BOOK3S_IRQPRIO_DATA_STORAGE:
+		vec = BOOK3S_INTERRUPT_DATA_STORAGE;
+		break;
+	case BOOK3S_IRQPRIO_INST_STORAGE:
+		vec = BOOK3S_INTERRUPT_INST_STORAGE;
+		break;
+	case BOOK3S_IRQPRIO_DATA_SEGMENT:
+		vec = BOOK3S_INTERRUPT_DATA_SEGMENT;
+		break;
+	case BOOK3S_IRQPRIO_INST_SEGMENT:
+		vec = BOOK3S_INTERRUPT_INST_SEGMENT;
+		break;
+	case BOOK3S_IRQPRIO_ALIGNMENT:
+		vec = BOOK3S_INTERRUPT_ALIGNMENT;
+		break;
+	case BOOK3S_IRQPRIO_PROGRAM:
+		vec = BOOK3S_INTERRUPT_PROGRAM;
+		break;
+	case BOOK3S_IRQPRIO_VSX:
+		vec = BOOK3S_INTERRUPT_VSX;
+		break;
+	case BOOK3S_IRQPRIO_ALTIVEC:
+		vec = BOOK3S_INTERRUPT_ALTIVEC;
+		break;
+	case BOOK3S_IRQPRIO_FP_UNAVAIL:
+		vec = BOOK3S_INTERRUPT_FP_UNAVAIL;
+		break;
+	case BOOK3S_IRQPRIO_SYSCALL:
+		vec = BOOK3S_INTERRUPT_SYSCALL;
+		break;
+	case BOOK3S_IRQPRIO_DEBUG:
+		vec = BOOK3S_INTERRUPT_TRACE;
+		break;
+	case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
+		vec = BOOK3S_INTERRUPT_PERFMON;
+		break;
+	case BOOK3S_IRQPRIO_FAC_UNAVAIL:
+		vec = BOOK3S_INTERRUPT_FAC_UNAVAIL;
+		break;
+	default:
+		deliver = 0;
+		printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority);
+		break;
+	}
+
+#if 0
+	printk(KERN_INFO "Deliver interrupt 0x%x? %x\n", vec, deliver);
+#endif
+
+	if (deliver)
+		kvmppc_inject_interrupt(vcpu, vec, 0);
+
+	return deliver;
+}
+
+/*
+ * This function determines if an irqprio should be cleared once issued.
+ */
+static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	switch (priority) {
+		case BOOK3S_IRQPRIO_DECREMENTER:
+			/* DEC interrupts get cleared by mtdec */
+			return false;
+		case BOOK3S_IRQPRIO_EXTERNAL:
+			/*
+			 * External interrupts get cleared by userspace
+			 * except when set by the KVM_INTERRUPT ioctl with
+			 * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
+			 */
+			if (vcpu->arch.external_oneshot) {
+				vcpu->arch.external_oneshot = 0;
+				return true;
+			}
+			return false;
+	}
+
+	return true;
+}
+
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+	unsigned int priority;
+
+#ifdef EXIT_DEBUG
+	if (vcpu->arch.pending_exceptions)
+		printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
+#endif
+	priority = __ffs(*pending);
+	while (priority < BOOK3S_IRQPRIO_MAX) {
+		if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
+		    clear_irqprio(vcpu, priority)) {
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
+			break;
+		}
+
+		priority = find_next_bit(pending,
+					 BITS_PER_BYTE * sizeof(*pending),
+					 priority + 1);
+	}
+
+	/* Tell the guest about our interrupt status */
+	kvmppc_update_int_pending(vcpu, *pending, old_pending);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
+
+kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
+			bool *writable)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
+	/* Magic page override */
+	gpa &= ~0xFFFULL;
+	if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) {
+		ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+		kvm_pfn_t pfn;
+
+		pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
+		get_page(pfn_to_page(pfn));
+		if (writable)
+			*writable = true;
+		return pfn;
+	}
+
+	return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable);
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_pfn);
+
+int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid,
+		 enum xlate_readwrite xlrw, struct kvmppc_pte *pte)
+{
+	bool data = (xlid == XLATE_DATA);
+	bool iswrite = (xlrw == XLATE_WRITE);
+	int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR));
+	int r;
+
+	if (relocated) {
+		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite);
+	} else {
+		pte->eaddr = eaddr;
+		pte->raddr = eaddr & KVM_PAM;
+		pte->vpage = VSID_REAL | eaddr >> 12;
+		pte->may_read = true;
+		pte->may_write = true;
+		pte->may_execute = true;
+		r = 0;
+
+		if ((kvmppc_get_msr(vcpu) & (MSR_IR | MSR_DR)) == MSR_DR &&
+		    !data) {
+			if ((vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+			    ((eaddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+			pte->raddr &= ~SPLIT_HACK_MASK;
+		}
+	}
+
+	return r;
+}
+
+/*
+ * Returns prefixed instructions with the prefix in the high 32 bits
+ * of *inst and suffix in the low 32 bits.  This is the same convention
+ * as used in HEIR, vcpu->arch.last_inst and vcpu->arch.emul_inst.
+ * Like vcpu->arch.last_inst but unlike vcpu->arch.emul_inst, each
+ * half of the value needs byte-swapping if the guest endianness is
+ * different from the host endianness.
+ */
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+		enum instruction_fetch_type type, unsigned long *inst)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+	int r;
+	u32 iw;
+
+	if (type == INST_SC)
+		pc -= 4;
+
+	r = kvmppc_ld(vcpu, &pc, sizeof(u32), &iw, false);
+	if (r != EMULATE_DONE)
+		return EMULATE_AGAIN;
+	/*
+	 * If [H]SRR1 indicates that the instruction that caused the
+	 * current interrupt is a prefixed instruction, get the suffix.
+	 */
+	if (kvmppc_get_msr(vcpu) & SRR1_PREFIXED) {
+		u32 suffix;
+		pc += 4;
+		r = kvmppc_ld(vcpu, &pc, sizeof(u32), &suffix, false);
+		if (r != EMULATE_DONE)
+			return EMULATE_AGAIN;
+		*inst = ((u64)iw << 32) | suffix;
+	} else {
+		*inst = iw;
+	}
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_load_last_inst);
+
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	int ret;
+
+	vcpu_load(vcpu);
+	ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	int ret;
+
+	vcpu_load(vcpu);
+	ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return ret;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	regs->pc = kvmppc_get_pc(vcpu);
+	regs->cr = kvmppc_get_cr(vcpu);
+	regs->ctr = kvmppc_get_ctr(vcpu);
+	regs->lr = kvmppc_get_lr(vcpu);
+	regs->xer = kvmppc_get_xer(vcpu);
+	regs->msr = kvmppc_get_msr(vcpu);
+	regs->srr0 = kvmppc_get_srr0(vcpu);
+	regs->srr1 = kvmppc_get_srr1(vcpu);
+	regs->pid = vcpu->arch.pid;
+	regs->sprg0 = kvmppc_get_sprg0(vcpu);
+	regs->sprg1 = kvmppc_get_sprg1(vcpu);
+	regs->sprg2 = kvmppc_get_sprg2(vcpu);
+	regs->sprg3 = kvmppc_get_sprg3(vcpu);
+	regs->sprg4 = kvmppc_get_sprg4(vcpu);
+	regs->sprg5 = kvmppc_get_sprg5(vcpu);
+	regs->sprg6 = kvmppc_get_sprg6(vcpu);
+	regs->sprg7 = kvmppc_get_sprg7(vcpu);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	kvmppc_set_pc(vcpu, regs->pc);
+	kvmppc_set_cr(vcpu, regs->cr);
+	kvmppc_set_ctr(vcpu, regs->ctr);
+	kvmppc_set_lr(vcpu, regs->lr);
+	kvmppc_set_xer(vcpu, regs->xer);
+	kvmppc_set_msr(vcpu, regs->msr);
+	kvmppc_set_srr0(vcpu, regs->srr0);
+	kvmppc_set_srr1(vcpu, regs->srr1);
+	kvmppc_set_sprg0(vcpu, regs->sprg0);
+	kvmppc_set_sprg1(vcpu, regs->sprg1);
+	kvmppc_set_sprg2(vcpu, regs->sprg2);
+	kvmppc_set_sprg3(vcpu, regs->sprg3);
+	kvmppc_set_sprg4(vcpu, regs->sprg4);
+	kvmppc_set_sprg5(vcpu, regs->sprg5);
+	kvmppc_set_sprg6(vcpu, regs->sprg6);
+	kvmppc_set_sprg7(vcpu, regs->sprg7);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -EOPNOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -EOPNOTSUPP;
+}
+
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (id) {
+		case KVM_REG_PPC_DAR:
+			*val = get_reg_val(id, kvmppc_get_dar(vcpu));
+			break;
+		case KVM_REG_PPC_DSISR:
+			*val = get_reg_val(id, kvmppc_get_dsisr(vcpu));
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = id - KVM_REG_PPC_FPR0;
+			*val = get_reg_val(id, VCPU_FPR(vcpu, i));
+			break;
+		case KVM_REG_PPC_FPSCR:
+			*val = get_reg_val(id, vcpu->arch.fp.fpscr);
+			break;
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				i = id - KVM_REG_PPC_VSR0;
+				val->vsxval[0] = vcpu->arch.fp.fpr[i][0];
+				val->vsxval[1] = vcpu->arch.fp.fpr[i][1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
+		case KVM_REG_PPC_DEBUG_INST:
+			*val = get_reg_val(id, INS_TW);
+			break;
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xics_on_xive())
+				*val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
+			else
+				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
+			break;
+#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+		case KVM_REG_PPC_VP_STATE:
+			if (!vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xive_enabled())
+				r = kvmppc_xive_native_get_vp(vcpu, val);
+			else
+				r = -ENXIO;
+			break;
+#endif /* CONFIG_KVM_XIVE */
+		case KVM_REG_PPC_FSCR:
+			*val = get_reg_val(id, vcpu->arch.fscr);
+			break;
+		case KVM_REG_PPC_TAR:
+			*val = get_reg_val(id, vcpu->arch.tar);
+			break;
+		case KVM_REG_PPC_EBBHR:
+			*val = get_reg_val(id, vcpu->arch.ebbhr);
+			break;
+		case KVM_REG_PPC_EBBRR:
+			*val = get_reg_val(id, vcpu->arch.ebbrr);
+			break;
+		case KVM_REG_PPC_BESCR:
+			*val = get_reg_val(id, vcpu->arch.bescr);
+			break;
+		case KVM_REG_PPC_IC:
+			*val = get_reg_val(id, vcpu->arch.ic);
+			break;
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (id) {
+		case KVM_REG_PPC_DAR:
+			kvmppc_set_dar(vcpu, set_reg_val(id, *val));
+			break;
+		case KVM_REG_PPC_DSISR:
+			kvmppc_set_dsisr(vcpu, set_reg_val(id, *val));
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = id - KVM_REG_PPC_FPR0;
+			VCPU_FPR(vcpu, i) = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_FPSCR:
+			vcpu->arch.fp.fpscr = set_reg_val(id, *val);
+			break;
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				i = id - KVM_REG_PPC_VSR0;
+				vcpu->arch.fp.fpr[i][0] = val->vsxval[0];
+				vcpu->arch.fp.fpr[i][1] = val->vsxval[1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xics_on_xive())
+				r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
+			else
+				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
+			break;
+#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+		case KVM_REG_PPC_VP_STATE:
+			if (!vcpu->arch.xive_vcpu) {
+				r = -ENXIO;
+				break;
+			}
+			if (xive_enabled())
+				r = kvmppc_xive_native_set_vp(vcpu, val);
+			else
+				r = -ENXIO;
+			break;
+#endif /* CONFIG_KVM_XIVE */
+		case KVM_REG_PPC_FSCR:
+			vcpu->arch.fscr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_TAR:
+			vcpu->arch.tar = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_EBBHR:
+			vcpu->arch.ebbhr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_EBBRR:
+			vcpu->arch.ebbrr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_BESCR:
+			vcpu->arch.bescr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_IC:
+			vcpu->arch.ic = set_reg_val(id, *val);
+			break;
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvmppc_set_msr);
+
+int kvmppc_vcpu_run(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.kvm_ops->vcpu_run(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	vcpu_load(vcpu);
+	vcpu->guest_debug = dbg->control;
+	vcpu_put(vcpu);
+	return 0;
+}
+
+void kvmppc_decrementer_func(struct kvm_vcpu *vcpu)
+{
+	kvmppc_core_queue_dec(vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.kvm_ops->check_requests(vcpu);
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
+}
+
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	kvm->arch.kvm_ops->free_memslot(slot);
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	kvm->arch.kvm_ops->flush_memslot(kvm, memslot);
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      const struct kvm_memory_slot *old,
+				      struct kvm_memory_slot *new,
+				      enum kvm_mr_change change)
+{
+	return kvm->arch.kvm_ops->prepare_memory_region(kvm, old, new, change);
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+	kvm->arch.kvm_ops->commit_memory_region(kvm, old, new, change);
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return kvm->arch.kvm_ops->age_gfn(kvm, range);
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
+}
+
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+
+#ifdef CONFIG_PPC64
+	INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
+	mutex_init(&kvm->arch.rtas_token_lock);
+#endif
+
+	return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvm->arch.kvm_ops->destroy_vm(kvm);
+
+#ifdef CONFIG_PPC64
+	kvmppc_rtas_tokens_free(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
+
+#ifdef CONFIG_KVM_XICS
+	/*
+	 * Free the XIVE and XICS devices which are not directly freed by the
+	 * device 'release' method
+	 */
+	kfree(kvm->arch.xive_devices.native);
+	kvm->arch.xive_devices.native = NULL;
+	kfree(kvm->arch.xive_devices.xics_on_xive);
+	kvm->arch.xive_devices.xics_on_xive = NULL;
+	kfree(kvm->arch.xics_device);
+	kvm->arch.xics_device = NULL;
+#endif /* CONFIG_KVM_XICS */
+}
+
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	u64 buf;
+	int srcu_idx;
+	int ret;
+
+	if (!is_power_of_2(size) || (size > sizeof(buf)))
+		return H_TOO_HARD;
+
+	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	switch (size) {
+	case 1:
+		kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+		break;
+
+	case 2:
+		kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+		break;
+
+	case 4:
+		kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+		break;
+
+	case 8:
+		kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+		break;
+
+	default:
+		BUG();
+	}
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	unsigned long val = kvmppc_get_gpr(vcpu, 6);
+	u64 buf;
+	int srcu_idx;
+	int ret;
+
+	switch (size) {
+	case 1:
+		*(u8 *)&buf = val;
+		break;
+
+	case 2:
+		*(__be16 *)&buf = cpu_to_be16(val);
+		break;
+
+	case 4:
+		*(__be32 *)&buf = cpu_to_be32(val);
+		break;
+
+	case 8:
+		*(__be64 *)&buf = cpu_to_be64(val);
+		break;
+
+	default:
+		return H_TOO_HARD;
+	}
+
+	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
+int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
+{
+	return kvm->arch.kvm_ops->hcall_implemented(hcall);
+}
+
+#ifdef CONFIG_KVM_XICS
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+		bool line_status)
+{
+	if (xics_on_xive())
+		return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
+					   line_status);
+	else
+		return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level,
+					   line_status);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
+			      struct kvm *kvm, int irq_source_id,
+			      int level, bool line_status)
+{
+	return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
+			   level, line_status);
+}
+static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e,
+				 struct kvm *kvm, int irq_source_id, int level,
+				 bool line_status)
+{
+	return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
+}
+
+int kvm_irq_map_gsi(struct kvm *kvm,
+		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
+{
+	entries->gsi = gsi;
+	entries->type = KVM_IRQ_ROUTING_IRQCHIP;
+	entries->set = kvmppc_book3s_set_irq;
+	entries->irqchip.irqchip = 0;
+	entries->irqchip.pin = gsi;
+	return 1;
+}
+
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+	return pin;
+}
+
+#endif /* CONFIG_KVM_XICS */
+
+static int kvmppc_book3s_init(void)
+{
+	int r;
+
+	r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+	if (r)
+		return r;
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	r = kvmppc_book3s_init_pr();
+#endif
+
+#ifdef CONFIG_KVM_XICS
+#ifdef CONFIG_KVM_XIVE
+	if (xics_on_xive()) {
+		kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
+		if (kvmppc_xive_native_supported())
+			kvm_register_device_ops(&kvm_xive_native_ops,
+						KVM_DEV_TYPE_XIVE);
+	} else
+#endif
+		kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
+#endif
+	return r;
+}
+
+static void kvmppc_book3s_exit(void)
+{
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kvmppc_book3s_exit_pr();
+#endif
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
+
+/* On 32bit this is our one and only kernel module */
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
+#endif
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
new file mode 100644
index 0000000000..58391b4b32
--- /dev/null
+++ b/arch/powerpc/kvm/book3s.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ */
+
+#ifndef __POWERPC_KVM_BOOK3S_H__
+#define __POWERPC_KVM_BOOK3S_H__
+
+extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+					 struct kvm_memory_slot *memslot);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+
+extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_pr(struct kvm_vcpu *vcpu,
+				     unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu,
+					int sprn, ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
+					int sprn, ulong *spr_val);
+extern int kvmppc_book3s_init_pr(void);
+void kvmppc_book3s_exit_pr(void);
+extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val);
+#else
+static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {}
+#endif
+
+extern void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr);
+extern void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags);
+
+#endif
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
new file mode 100644
index 0000000000..0215f32932
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_MMU_PTE */
+/* #define DEBUG_MMU_PTE_IP 0xfff14c40 */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+#ifdef DEBUG_MMU_PTE
+#define dprintk_pte(X...) printk(KERN_INFO X)
+#else
+#define dprintk_pte(X...) do { } while(0)
+#endif
+
+#define PTEG_FLAG_ACCESSED	0x00000100
+#define PTEG_FLAG_DIRTY		0x00000080
+#ifndef SID_SHIFT
+#define SID_SHIFT		28
+#endif
+
+static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
+{
+#ifdef DEBUG_MMU_PTE_IP
+	return vcpu->arch.regs.nip == DEBUG_MMU_PTE_IP;
+#else
+	return true;
+#endif
+}
+
+static inline u32 sr_vsid(u32 sr_raw)
+{
+	return sr_raw & 0x0fffffff;
+}
+
+static inline bool sr_valid(u32 sr_raw)
+{
+	return (sr_raw & 0x80000000) ? false : true;
+}
+
+static inline bool sr_ks(u32 sr_raw)
+{
+	return (sr_raw & 0x40000000) ? true: false;
+}
+
+static inline bool sr_kp(u32 sr_raw)
+{
+	return (sr_raw & 0x20000000) ? true: false;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+					  struct kvmppc_pte *pte, bool data,
+					  bool iswrite);
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid);
+
+static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf);
+}
+
+static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+					 bool data)
+{
+	u64 vsid;
+	struct kvmppc_pte pte;
+
+	if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false))
+		return pte.vpage;
+
+	kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+	return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16);
+}
+
+static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,
+				      u32 sre, gva_t eaddr,
+				      bool primary)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u32 page, hash, pteg, htabmask;
+	hva_t r;
+
+	page = (eaddr & 0x0FFFFFFF) >> 12;
+	htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
+
+	hash = ((sr_vsid(sre) ^ page) << 6);
+	if (!primary)
+		hash = ~hash;
+	hash &= htabmask;
+
+	pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
+
+	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
+		kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg,
+		sr_vsid(sre));
+
+	r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
+	if (kvm_is_error_hva(r))
+		return r;
+	return r | (pteg & ~PAGE_MASK);
+}
+
+static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
+{
+	return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) |
+	       (primary ? 0 : 0x40) | 0x80000000;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+					  struct kvmppc_pte *pte, bool data,
+					  bool iswrite)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	struct kvmppc_bat *bat;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		if (data)
+			bat = &vcpu_book3s->dbat[i];
+		else
+			bat = &vcpu_book3s->ibat[i];
+
+		if (kvmppc_get_msr(vcpu) & MSR_PR) {
+			if (!bat->vp)
+				continue;
+		} else {
+			if (!bat->vs)
+				continue;
+		}
+
+		if (check_debug_ip(vcpu))
+		{
+			dprintk_pte("%cBAT %02d: 0x%lx - 0x%x (0x%x)\n",
+				    data ? 'd' : 'i', i, eaddr, bat->bepi,
+				    bat->bepi_mask);
+		}
+		if ((eaddr & bat->bepi_mask) == bat->bepi) {
+			u64 vsid;
+			kvmppc_mmu_book3s_32_esid_to_vsid(vcpu,
+				eaddr >> SID_SHIFT, &vsid);
+			vsid <<= 16;
+			pte->vpage = (((u64)eaddr >> 12) & 0xffff) | vsid;
+
+			pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask);
+			pte->may_read = bat->pp;
+			pte->may_write = bat->pp > 1;
+			pte->may_execute = true;
+			if (!pte->may_read) {
+				printk(KERN_INFO "BAT is not readable!\n");
+				continue;
+			}
+			if (iswrite && !pte->may_write) {
+				dprintk_pte("BAT is read-only!\n");
+				continue;
+			}
+
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
+				     struct kvmppc_pte *pte, bool data,
+				     bool iswrite, bool primary)
+{
+	u32 sre;
+	hva_t ptegp;
+	u32 pteg[16];
+	u32 pte0, pte1;
+	u32 ptem = 0;
+	int i;
+	int found = 0;
+
+	sre = find_sr(vcpu, eaddr);
+
+	dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
+		    sr_vsid(sre), sre);
+
+	pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+
+	ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary);
+	if (kvm_is_error_hva(ptegp)) {
+		printk(KERN_INFO "KVM: Invalid PTEG!\n");
+		goto no_page_found;
+	}
+
+	ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
+
+	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+		printk_ratelimited(KERN_ERR
+			"KVM: Can't copy data from 0x%lx!\n", ptegp);
+		goto no_page_found;
+	}
+
+	for (i=0; i<16; i+=2) {
+		pte0 = be32_to_cpu(pteg[i]);
+		pte1 = be32_to_cpu(pteg[i + 1]);
+		if (ptem == pte0) {
+			u8 pp;
+
+			pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF);
+			pp = pte1 & 3;
+
+			if ((sr_kp(sre) &&  (kvmppc_get_msr(vcpu) & MSR_PR)) ||
+			    (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR)))
+				pp |= 4;
+
+			pte->may_write = false;
+			pte->may_read = false;
+			pte->may_execute = true;
+			switch (pp) {
+				case 0:
+				case 1:
+				case 2:
+				case 6:
+					pte->may_write = true;
+					fallthrough;
+				case 3:
+				case 5:
+				case 7:
+					pte->may_read = true;
+					break;
+			}
+
+			dprintk_pte("MMU: Found PTE -> %x %x - %x\n",
+				    pte0, pte1, pp);
+			found = 1;
+			break;
+		}
+	}
+
+	/* Update PTE C and A bits, so the guest's swapper knows we used the
+	   page */
+	if (found) {
+		u32 pte_r = pte1;
+		char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32));
+
+		/*
+		 * Use single-byte writes to update the HPTE, to
+		 * conform to what real hardware does.
+		 */
+		if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) {
+			pte_r |= PTEG_FLAG_ACCESSED;
+			put_user(pte_r >> 8, addr + 2);
+		}
+		if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) {
+			pte_r |= PTEG_FLAG_DIRTY;
+			put_user(pte_r, addr + 3);
+		}
+		if (!pte->may_read || (iswrite && !pte->may_write))
+			return -EPERM;
+		return 0;
+	}
+
+no_page_found:
+
+	if (check_debug_ip(vcpu)) {
+		dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
+			    to_book3s(vcpu)->sdr1, ptegp);
+		for (i=0; i<16; i+=2) {
+			dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n",
+				    i, be32_to_cpu(pteg[i]),
+				    be32_to_cpu(pteg[i+1]), ptem);
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				      struct kvmppc_pte *pte, bool data,
+				      bool iswrite)
+{
+	int r;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	pte->eaddr = eaddr;
+	pte->page_size = MMU_PAGE_4K;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+		pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
+		pte->raddr &= KVM_PAM;
+		pte->may_execute = true;
+		pte->may_read = true;
+		pte->may_write = true;
+
+		return 0;
+	}
+
+	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite);
+	if (r < 0)
+		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+						   data, iswrite, true);
+	if (r == -ENOENT)
+		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+						   data, iswrite, false);
+
+	return r;
+}
+
+
+static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
+{
+	return kvmppc_get_sr(vcpu, srnum);
+}
+
+static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+					ulong value)
+{
+	kvmppc_set_sr(vcpu, srnum, value);
+	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
+}
+
+static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)
+{
+	unsigned long i;
+	struct kvm_vcpu *v;
+
+	/* flush this VA on all cpus */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000);
+}
+
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid)
+{
+	ulong ea = esid << SID_SHIFT;
+	u32 sr;
+	u64 gvsid = esid;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	if (msr & (MSR_DR|MSR_IR)) {
+		sr = find_sr(vcpu, ea);
+		if (sr_valid(sr))
+			gvsid = sr_vsid(sr);
+	}
+
+	/* In case we only have one of MSR_IR or MSR_DR set, let's put
+	   that in the real-mode context (and hope RM doesn't access
+	   high memory) */
+	switch (msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		*vsid = VSID_REAL | esid;
+		break;
+	case MSR_IR:
+		*vsid = VSID_REAL_IR | gvsid;
+		break;
+	case MSR_DR:
+		*vsid = VSID_REAL_DR | gvsid;
+		break;
+	case MSR_DR|MSR_IR:
+		if (sr_valid(sr))
+			*vsid = sr_vsid(sr);
+		else
+			*vsid = VSID_BAT | gvsid;
+		break;
+	default:
+		BUG();
+	}
+
+	if (msr & MSR_PR)
+		*vsid |= VSID_PR;
+
+	return 0;
+}
+
+static bool kvmppc_mmu_book3s_32_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+	return true;
+}
+
+
+void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin;
+	mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin;
+	mmu->xlate = kvmppc_mmu_book3s_32_xlate;
+	mmu->tlbie = kvmppc_mmu_book3s_32_tlbie;
+	mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid;
+	mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp;
+	mmu->is_dcbz32 = kvmppc_mmu_book3s_32_is_dcbz32;
+
+	mmu->slbmte = NULL;
+	mmu->slbmfee = NULL;
+	mmu->slbmfev = NULL;
+	mmu->slbfee = NULL;
+	mmu->slbie = NULL;
+	mmu->slbia = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
new file mode 100644
index 0000000000..4b3a8d80cf
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/32/mmu-hash.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+#include "book3s.h"
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SR */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+#ifdef DEBUG_SR
+#define dprintk_sr(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_sr(a, ...) do { } while(0)
+#endif
+
+#if PAGE_SHIFT != 12
+#error Unknown page size
+#endif
+
+#ifdef CONFIG_SMP
+#error XXX need to grab mmu_hash_lock
+#endif
+
+#ifdef CONFIG_PTE_64BIT
+#error Only 32 bit pages are supported for now
+#endif
+
+static ulong htab;
+static u32 htabmask;
+
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	volatile u32 *pteg;
+
+	/* Remove from host HTAB */
+	pteg = (u32*)pte->slot;
+	pteg[0] = 0;
+
+	/* And make sure it's gone from the TLB too */
+	asm volatile ("sync");
+	asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
+	asm volatile ("sync");
+	asm volatile ("tlbsync");
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	u16 sid_map_mask;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	dprintk_sr("SR: Searching 0x%llx -> not found\n", gvsid);
+	return NULL;
+}
+
+static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr,
+				bool primary)
+{
+	u32 page, hash;
+	ulong pteg = htab;
+
+	page = (eaddr & ~ESID_MASK) >> 12;
+
+	hash = ((vsid ^ page) << 6);
+	if (!primary)
+		hash = ~hash;
+
+	hash &= htabmask;
+
+	pteg |= hash;
+
+	dprintk_mmu("htab: %lx | hash: %x | htabmask: %x | pteg: %lx\n",
+		htab, hash, htabmask, pteg);
+
+	return (u32*)pteg;
+}
+
+extern char etext[];
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+			bool iswrite)
+{
+	kvm_pfn_t hpaddr;
+	u64 vpn;
+	u64 vsid;
+	struct kvmppc_sid_map *map;
+	volatile u32 *pteg;
+	u32 eaddr = orig_pte->eaddr;
+	u32 pteg0, pteg1;
+	register int rr = 0;
+	bool primary = false;
+	bool evict = false;
+	struct hpte_cache *pte;
+	int r = 0;
+	bool writable;
+
+	/* Get host physical address for gpa */
+	hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable);
+	if (is_error_noslot_pfn(hpaddr)) {
+		printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n",
+				 orig_pte->raddr);
+		r = -EINVAL;
+		goto out;
+	}
+	hpaddr <<= PAGE_SHIFT;
+
+	/* and write the mapping ea -> hpa into the pt */
+	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+	map = find_sid_vsid(vcpu, vsid);
+	if (!map) {
+		kvmppc_mmu_map_segment(vcpu, eaddr);
+		map = find_sid_vsid(vcpu, vsid);
+	}
+	BUG_ON(!map);
+
+	vsid = map->host_vsid;
+	vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) |
+		((eaddr & ~ESID_MASK) >> VPN_SHIFT);
+next_pteg:
+	if (rr == 16) {
+		primary = !primary;
+		evict = true;
+		rr = 0;
+	}
+
+	pteg = kvmppc_mmu_get_pteg(vcpu, vsid, eaddr, primary);
+
+	/* not evicting yet */
+	if (!evict && (pteg[rr] & PTE_V)) {
+		rr += 2;
+		goto next_pteg;
+	}
+
+	dprintk_mmu("KVM: old PTEG: %p (%d)\n", pteg, rr);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[0], pteg[1]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[2], pteg[3]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[4], pteg[5]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[6], pteg[7]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[8], pteg[9]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[10], pteg[11]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[12], pteg[13]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[14], pteg[15]);
+
+	pteg0 = ((eaddr & 0x0fffffff) >> 22) | (vsid << 7) | PTE_V |
+		(primary ? 0 : PTE_SEC);
+	pteg1 = hpaddr | PTE_M | PTE_R | PTE_C;
+
+	if (orig_pte->may_write && writable) {
+		pteg1 |= PP_RWRW;
+		mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+	} else {
+		pteg1 |= PP_RWRX;
+	}
+
+	if (orig_pte->may_execute)
+		kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
+
+	local_irq_disable();
+
+	if (pteg[rr]) {
+		pteg[rr] = 0;
+		asm volatile ("sync");
+	}
+	pteg[rr + 1] = pteg1;
+	pteg[rr] = pteg0;
+	asm volatile ("sync");
+
+	local_irq_enable();
+
+	dprintk_mmu("KVM: new PTEG: %p\n", pteg);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[0], pteg[1]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[2], pteg[3]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[4], pteg[5]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[6], pteg[7]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[8], pteg[9]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[10], pteg[11]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[12], pteg[13]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[14], pteg[15]);
+
+
+	/* Now tell our Shadow PTE code about the new page */
+
+	pte = kvmppc_mmu_hpte_cache_next(vcpu);
+	if (!pte) {
+		kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
+		r = -EAGAIN;
+		goto out;
+	}
+
+	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
+		    orig_pte->may_write ? 'w' : '-',
+		    orig_pte->may_execute ? 'x' : '-',
+		    orig_pte->eaddr, (ulong)pteg, vpn,
+		    orig_pte->vpage, hpaddr);
+
+	pte->slot = (ulong)&pteg[rr];
+	pte->host_vpn = vpn;
+	pte->pte = *orig_pte;
+	pte->pfn = hpaddr >> PAGE_SHIFT;
+
+	kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
+	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
+out:
+	return r;
+}
+
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL);
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u16 sid_map_mask;
+	static int backwards_map = 0;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	/* We might get collisions that trap in preceding order, so let's
+	   map them differently */
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	if (backwards_map)
+		sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+	/* Make sure we're taking the other map next time */
+	backwards_map = !backwards_map;
+
+	/* Uh-oh ... out of mappings. Let's flush! */
+	if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) {
+		vcpu_book3s->vsid_next = 0;
+		memset(vcpu_book3s->sid_map, 0,
+		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+	}
+	map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next];
+	vcpu_book3s->vsid_next++;
+
+	map->guest_vsid = gvsid;
+	map->valid = true;
+
+	return map;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	u32 esid = eaddr >> SID_SHIFT;
+	u64 gvsid;
+	u32 sr;
+	struct kvmppc_sid_map *map;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	int r = 0;
+
+	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+		/* Invalidate an entry */
+		svcpu->sr[esid] = SR_INVALID;
+		r = -ENOENT;
+		goto out;
+	}
+
+	map = find_sid_vsid(vcpu, gvsid);
+	if (!map)
+		map = create_sid_map(vcpu, gvsid);
+
+	map->guest_esid = esid;
+	sr = map->host_vsid | SR_KP;
+	svcpu->sr[esid] = sr;
+
+	dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+
+	dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
+	for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
+		svcpu->sr[i] = SR_INVALID;
+
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	kvmppc_mmu_hpte_destroy(vcpu);
+	preempt_disable();
+	for (i = 0; i < SID_CONTEXTS; i++)
+		__destroy_context(to_book3s(vcpu)->context_id[i]);
+	preempt_enable();
+}
+
+int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int err;
+	ulong sdr1;
+	int i;
+	int j;
+
+	for (i = 0; i < SID_CONTEXTS; i++) {
+		err = __init_new_context();
+		if (err < 0)
+			goto init_fail;
+		vcpu3s->context_id[i] = err;
+
+		/* Remember context id for this combination */
+		for (j = 0; j < 16; j++)
+			vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j);
+	}
+
+	vcpu3s->vsid_next = 0;
+
+	/* Remember where the HTAB is */
+	asm ( "mfsdr1 %0" : "=r"(sdr1) );
+	htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
+	htab = (ulong)__va(sdr1 & 0xffff0000);
+
+	kvmppc_mmu_hpte_init(vcpu);
+
+	return 0;
+
+init_fail:
+	for (j = 0; j < i; j++) {
+		if (!vcpu3s->context_id[j])
+			continue;
+
+		__destroy_context(to_book3s(vcpu)->context_id[j]);
+	}
+
+	return -1;
+}
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
new file mode 100644
index 0000000000..6cfcd20d46
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R3 = shadow vcpu
+	 * all other volatile GPRS = free except R4, R6
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 */
+
+#define XCHG_SR(n)	lwz	r9, (SVCPU_SR+(n*4))(r3);  \
+			mtsr	n, r9
+
+	XCHG_SR(0)
+	XCHG_SR(1)
+	XCHG_SR(2)
+	XCHG_SR(3)
+	XCHG_SR(4)
+	XCHG_SR(5)
+	XCHG_SR(6)
+	XCHG_SR(7)
+	XCHG_SR(8)
+	XCHG_SR(9)
+	XCHG_SR(10)
+	XCHG_SR(11)
+	XCHG_SR(12)
+	XCHG_SR(13)
+	XCHG_SR(14)
+	XCHG_SR(15)
+
+	/* Clear BATs. */
+
+#define KVM_KILL_BAT(n, reg)		\
+        mtspr   SPRN_IBAT##n##U,reg;	\
+        mtspr   SPRN_IBAT##n##L,reg;	\
+        mtspr   SPRN_DBAT##n##U,reg;	\
+        mtspr   SPRN_DBAT##n##L,reg;	\
+
+        li	r9, 0
+	KVM_KILL_BAT(0, r9)
+	KVM_KILL_BAT(1, r9)
+	KVM_KILL_BAT(2, r9)
+	KVM_KILL_BAT(3, r9)
+
+.endm
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+	/* Register usage at this point:
+	 *
+	 * R1         = host R1
+	 * R2         = host R2
+	 * R12        = exit handler id
+	 * R13        = shadow vcpu - SHADOW_VCPU_OFF
+	 * SVCPU.*    = guest *
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 *
+	 */
+
+	/* Restore BATs */
+
+	/* We only overwrite the upper part, so we only restoree
+	   the upper part. */
+#define KVM_LOAD_BAT(n, reg, RA, RB)	\
+	lwz	RA,(n*16)+0(reg);	\
+	lwz	RB,(n*16)+4(reg);	\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_IBAT##n##L,RB;	\
+	lwz	RA,(n*16)+8(reg);	\
+	lwz	RB,(n*16)+12(reg);	\
+	mtspr	SPRN_DBAT##n##U,RA;	\
+	mtspr	SPRN_DBAT##n##L,RB;	\
+
+	lis     r9, BATS@ha
+	addi    r9, r9, BATS@l
+	tophys(r9, r9)
+	KVM_LOAD_BAT(0, r9, r10, r11)
+	KVM_LOAD_BAT(1, r9, r10, r11)
+	KVM_LOAD_BAT(2, r9, r10, r11)
+	KVM_LOAD_BAT(3, r9, r10, r11)
+
+	/* Restore Segment Registers */
+
+	/* 0xc - 0xf */
+
+        li      r0, 4
+        mtctr   r0
+	LOAD_REG_IMMEDIATE(r3, 0x20000000 | (0x111 * 0xc))
+        lis     r4, 0xc000
+3:      mtsrin  r3, r4
+        addi    r3, r3, 0x111     /* increment VSID */
+        addis   r4, r4, 0x1000    /* address of next segment */
+        bdnz    3b
+
+	/* 0x0 - 0xb */
+
+	/* switch_mmu_context() needs paging, let's enable it */
+	mfmsr   r9
+	ori     r11, r9, MSR_DR
+	mtmsr   r11
+	sync
+
+	/* switch_mmu_context() clobbers r12, rescue it */
+	SAVE_GPR(12, r1)
+
+	/* Calling switch_mmu_context(<inv>, current->mm, <inv>); */
+	lwz	r4, MM(r2)
+	bl	switch_mmu_context
+
+	/* restore r12 */
+	REST_GPR(12, r1)
+
+	/* Disable paging again */
+	mfmsr   r9
+	li      r6, MSR_DR
+	andc    r9, r9, r6
+	mtmsr	r9
+	sync
+
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
new file mode 100644
index 0000000000..3b361af873
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/export.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+#include <asm/code-patching-asm.h>
+#include <asm/exception-64s.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/ultravisor-api.h>
+
+/*
+ * These are branched to from interrupt handlers in exception-64s.S which set
+ * IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
+ */
+
+/*
+ * This is a hcall, so register convention is as
+ * Documentation/powerpc/papr_hcalls.rst.
+ *
+ * This may also be a syscall from PR-KVM userspace that is to be
+ * reflected to the PR guest kernel, so registers may be set up for
+ * a system call rather than hcall. We don't currently clobber
+ * anything here, but the 0xc00 handler has already clobbered CTR
+ * and CR0, so PR-KVM can not support a guest kernel that preserves
+ * those registers across its system calls.
+ *
+ * The state of registers is as kvmppc_interrupt, except CFAR is not
+ * saved, R13 is not in SCRATCH0, and R10 does not contain the trap.
+ */
+.global	kvmppc_hcall
+.balign IFETCH_ALIGN_BYTES
+kvmppc_hcall:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	lbz	r10,HSTATE_IN_GUEST(r13)
+	cmpwi	r10,KVM_GUEST_MODE_HV_P9
+	beq	kvmppc_p9_exit_hcall
+#endif
+	ld	r10,PACA_EXGEN+EX_R13(r13)
+	SET_SCRATCH0(r10)
+	li	r10,0xc00
+	/* Now we look like kvmppc_interrupt */
+	li	r11,PACA_EXGEN
+	b	.Lgot_save_area
+
+/*
+ * KVM interrupt entry occurs after GEN_INT_ENTRY runs, and follows that
+ * call convention:
+ *
+ * guest R9-R13, CTR, CFAR, PPR saved in PACA EX_xxx save area
+ * guest (H)DAR, (H)DSISR are also in the save area for relevant interrupts
+ * guest R13 also saved in SCRATCH0
+ * R13		= PACA
+ * R11		= (H)SRR0
+ * R12		= (H)SRR1
+ * R9		= guest CR
+ * PPR is set to medium
+ *
+ * With the addition for KVM:
+ * R10		= trap vector
+ */
+.global	kvmppc_interrupt
+.balign IFETCH_ALIGN_BYTES
+kvmppc_interrupt:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	std	r10,HSTATE_SCRATCH0(r13)
+	lbz	r10,HSTATE_IN_GUEST(r13)
+	cmpwi	r10,KVM_GUEST_MODE_HV_P9
+	beq	kvmppc_p9_exit_interrupt
+	ld	r10,HSTATE_SCRATCH0(r13)
+#endif
+	li	r11,PACA_EXGEN
+	cmpdi	r10,0x200
+	bgt+	.Lgot_save_area
+	li	r11,PACA_EXMC
+	beq	.Lgot_save_area
+	li	r11,PACA_EXNMI
+.Lgot_save_area:
+	add	r11,r11,r13
+BEGIN_FTR_SECTION
+	ld	r12,EX_CFAR(r11)
+	std	r12,HSTATE_CFAR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	ld	r12,EX_CTR(r11)
+	mtctr	r12
+BEGIN_FTR_SECTION
+	ld	r12,EX_PPR(r11)
+	std	r12,HSTATE_PPR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+	ld	r12,EX_R12(r11)
+	std	r12,HSTATE_SCRATCH0(r13)
+	sldi	r12,r9,32
+	or	r12,r12,r10
+	ld	r9,EX_R9(r11)
+	ld	r10,EX_R10(r11)
+	ld	r11,EX_R11(r11)
+
+	/*
+	 * Hcalls and other interrupts come here after normalising register
+	 * contents and save locations:
+	 *
+	 * R12		= (guest CR << 32) | interrupt vector
+	 * R13		= PACA
+	 * guest R12 saved in shadow HSTATE_SCRATCH0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	std	r9,HSTATE_SCRATCH2(r13)
+	lbz	r9,HSTATE_IN_GUEST(r13)
+	cmpwi	r9,KVM_GUEST_MODE_SKIP
+	beq-	.Lmaybe_skip
+.Lno_skip:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	cmpwi	r9,KVM_GUEST_MODE_GUEST
+	beq	kvmppc_interrupt_pr
+#endif
+	b	kvmppc_interrupt_hv
+#else
+	b	kvmppc_interrupt_pr
+#endif
+
+/*
+ * "Skip" interrupts are part of a trick KVM uses a with hash guests to load
+ * the faulting instruction in guest memory from the hypervisor without
+ * walking page tables.
+ *
+ * When the guest takes a fault that requires the hypervisor to load the
+ * instruction (e.g., MMIO emulation), KVM is running in real-mode with HV=1
+ * and the guest MMU context loaded. It sets KVM_GUEST_MODE_SKIP, and sets
+ * MSR[DR]=1 while leaving MSR[IR]=0, so it continues to fetch HV instructions
+ * but loads and stores will access the guest context. This is used to load
+ * the faulting instruction using the faulting guest effective address.
+ *
+ * However the guest context may not be able to translate, or it may cause a
+ * machine check or other issue, which results in a fault in the host
+ * (even with KVM-HV).
+ *
+ * These faults come here because KVM_GUEST_MODE_SKIP was set, so if they
+ * are (or are likely) caused by that load, the instruction is skipped by
+ * just returning with the PC advanced +4, where it is noticed the load did
+ * not execute and it goes to the slow path which walks the page tables to
+ * read guest memory.
+ */
+.Lmaybe_skip:
+	cmpwi	r12,BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	1f
+	cmpwi	r12,BOOK3S_INTERRUPT_DATA_STORAGE
+	beq	1f
+	cmpwi	r12,BOOK3S_INTERRUPT_DATA_SEGMENT
+	beq	1f
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* HSRR interrupts get 2 added to interrupt number */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | 0x2
+	beq	2f
+#endif
+	b	.Lno_skip
+1:	mfspr	r9,SPRN_SRR0
+	addi	r9,r9,4
+	mtspr	SPRN_SRR0,r9
+	ld	r12,HSTATE_SCRATCH0(r13)
+	ld	r9,HSTATE_SCRATCH2(r13)
+	GET_SCRATCH0(r13)
+	RFI_TO_KERNEL
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+2:	mfspr	r9,SPRN_HSRR0
+	addi	r9,r9,4
+	mtspr	SPRN_HSRR0,r9
+	ld	r12,HSTATE_SCRATCH0(r13)
+	ld	r9,HSTATE_SCRATCH2(r13)
+	GET_SCRATCH0(r13)
+	HRFI_TO_KERNEL
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+
+/* Stack frame offsets for kvmppc_p9_enter_guest */
+#define SFS			(144 + STACK_FRAME_MIN_SIZE)
+#define STACK_SLOT_NVGPRS	(SFS - 144)	/* 18 gprs */
+
+/*
+ * void kvmppc_p9_enter_guest(struct vcpu *vcpu);
+ *
+ * Enter the guest on a ISAv3.0 or later system.
+ */
+.balign	IFETCH_ALIGN_BYTES
+_GLOBAL(kvmppc_p9_enter_guest)
+EXPORT_SYMBOL_GPL(kvmppc_p9_enter_guest)
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+	stdu	r1,-SFS(r1)
+
+	std	r1,HSTATE_HOST_R1(r13)
+
+	mfcr	r4
+	stw	r4,SFS+8(r1)
+
+	reg = 14
+	.rept	18
+	std	reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+	reg = reg + 1
+	.endr
+
+	ld	r4,VCPU_LR(r3)
+	mtlr	r4
+	ld	r4,VCPU_CTR(r3)
+	mtctr	r4
+	ld	r4,VCPU_XER(r3)
+	mtspr	SPRN_XER,r4
+
+	ld	r1,VCPU_CR(r3)
+
+BEGIN_FTR_SECTION
+	ld	r4,VCPU_CFAR(r3)
+	mtspr	SPRN_CFAR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+	ld	r4,VCPU_PPR(r3)
+	mtspr	SPRN_PPR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	reg = 4
+	.rept	28
+	ld	reg,__VCPU_GPR(reg)(r3)
+	reg = reg + 1
+	.endr
+
+	ld	r4,VCPU_KVM(r3)
+	lbz	r4,KVM_SECURE_GUEST(r4)
+	cmpdi	r4,0
+	ld	r4,VCPU_GPR(R4)(r3)
+	bne	.Lret_to_ultra
+
+	mtcr	r1
+
+	ld	r0,VCPU_GPR(R0)(r3)
+	ld	r1,VCPU_GPR(R1)(r3)
+	ld	r2,VCPU_GPR(R2)(r3)
+	ld	r3,VCPU_GPR(R3)(r3)
+
+	HRFI_TO_GUEST
+	b	.
+
+	/*
+	 * Use UV_RETURN ultracall to return control back to the Ultravisor
+	 * after processing an hypercall or interrupt that was forwarded
+	 * (a.k.a. reflected) to the Hypervisor.
+	 *
+	 * All registers have already been reloaded except the ucall requires:
+	 *   R0 = hcall result
+	 *   R2 = SRR1, so UV can detect a synthesized interrupt (if any)
+	 *   R3 = UV_RETURN
+	 */
+.Lret_to_ultra:
+	mtcr	r1
+	ld	r1,VCPU_GPR(R1)(r3)
+
+	ld	r0,VCPU_GPR(R3)(r3)
+	mfspr	r2,SPRN_SRR1
+	LOAD_REG_IMMEDIATE(r3, UV_RETURN)
+	sc	2
+
+/*
+ * kvmppc_p9_exit_hcall and kvmppc_p9_exit_interrupt are branched to from
+ * above if the interrupt was taken for a guest that was entered via
+ * kvmppc_p9_enter_guest().
+ *
+ * The exit code recovers the host stack and vcpu pointer, saves all guest GPRs
+ * and CR, LR, XER as well as guest MSR and NIA into the VCPU, then re-
+ * establishes the host stack and registers to return from the
+ * kvmppc_p9_enter_guest() function, which saves CTR and other guest registers
+ * (SPRs and FP, VEC, etc).
+ */
+.balign	IFETCH_ALIGN_BYTES
+kvmppc_p9_exit_hcall:
+	mfspr	r11,SPRN_SRR0
+	mfspr	r12,SPRN_SRR1
+	li	r10,0xc00
+	std	r10,HSTATE_SCRATCH0(r13)
+
+.balign	IFETCH_ALIGN_BYTES
+kvmppc_p9_exit_interrupt:
+	/*
+	 * If set to KVM_GUEST_MODE_HV_P9 but we're still in the
+	 * hypervisor, that means we can't return from the entry stack.
+	 */
+	rldicl. r10,r12,64-MSR_HV_LG,63
+	bne-	kvmppc_p9_bad_interrupt
+
+	std     r1,HSTATE_SCRATCH1(r13)
+	std     r3,HSTATE_SCRATCH2(r13)
+	ld	r1,HSTATE_HOST_R1(r13)
+	ld	r3,HSTATE_KVM_VCPU(r13)
+
+	std	r9,VCPU_CR(r3)
+
+1:
+	std	r11,VCPU_PC(r3)
+	std	r12,VCPU_MSR(r3)
+
+	reg = 14
+	.rept	18
+	std	reg,__VCPU_GPR(reg)(r3)
+	reg = reg + 1
+	.endr
+
+	/* r1, r3, r9-r13 are saved to vcpu by C code */
+	std	r0,VCPU_GPR(R0)(r3)
+	std	r2,VCPU_GPR(R2)(r3)
+	reg = 4
+	.rept	5
+	std	reg,__VCPU_GPR(reg)(r3)
+	reg = reg + 1
+	.endr
+
+	LOAD_PACA_TOC()
+
+	mflr	r4
+	std	r4,VCPU_LR(r3)
+	mfspr	r4,SPRN_XER
+	std	r4,VCPU_XER(r3)
+
+	reg = 14
+	.rept	18
+	ld	reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+	reg = reg + 1
+	.endr
+
+	lwz	r4,SFS+8(r1)
+	mtcr	r4
+
+	/*
+	 * Flush the link stack here, before executing the first blr on the
+	 * way out of the guest.
+	 *
+	 * The link stack won't match coming out of the guest anyway so the
+	 * only cost is the flush itself. The call clobbers r0.
+	 */
+1:	nop
+	patch_site 1b patch__call_kvm_flush_link_stack_p9
+
+	addi	r1,r1,SFS
+	ld	r0,PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * Took an interrupt somewhere right before HRFID to guest, so registers are
+ * in a bad way. Return things hopefully enough to run host virtual code and
+ * run the Linux interrupt handler (SRESET or MCE) to print something useful.
+ *
+ * We could be really clever and save all host registers in known locations
+ * before setting HSTATE_IN_GUEST, then restoring them all here, and setting
+ * return address to a fixup that sets them up again. But that's a lot of
+ * effort for a small bit of code. Lots of other things to do first.
+ */
+kvmppc_p9_bad_interrupt:
+BEGIN_MMU_FTR_SECTION
+	/*
+	 * Hash host doesn't try to recover MMU (requires host SLB reload)
+	 */
+	b	.
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
+	/*
+	 * Clean up guest registers to give host a chance to run.
+	 */
+	li	r10,0
+	mtspr	SPRN_AMR,r10
+	mtspr	SPRN_IAMR,r10
+	mtspr	SPRN_CIABR,r10
+	mtspr	SPRN_DAWRX0,r10
+BEGIN_FTR_SECTION
+	mtspr	SPRN_DAWRX1,r10
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
+
+	/*
+	 * Switch to host MMU mode (don't have the real host PID but we aren't
+	 * going back to userspace).
+	 */
+	hwsync
+	isync
+
+	mtspr	SPRN_PID,r10
+
+	ld	r10, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_KVM(r10)
+	lwz	r10, KVM_HOST_LPID(r10)
+	mtspr	SPRN_LPID,r10
+
+	ld	r10, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_KVM(r10)
+	ld	r10, KVM_HOST_LPCR(r10)
+	mtspr	SPRN_LPCR,r10
+
+	isync
+
+	/*
+	 * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
+	 * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
+	 */
+	li	r10,KVM_GUEST_MODE_NONE
+	stb	r10,HSTATE_IN_GUEST(r13)
+	li	r10,MSR_RI
+	andc	r12,r12,r10
+
+	/*
+	 * Go back to interrupt handler. MCE and SRESET have their specific
+	 * PACA save area so they should be used directly. They set up their
+	 * own stack. The other handlers all use EXGEN. They will use the
+	 * guest r1 if it looks like a kernel stack, so just load the
+	 * emergency stack and go to program check for all other interrupts.
+	 */
+	ld	r10,HSTATE_SCRATCH0(r13)
+	cmpwi	r10,BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	.Lcall_machine_check_common
+
+	cmpwi	r10,BOOK3S_INTERRUPT_SYSTEM_RESET
+	beq	.Lcall_system_reset_common
+
+	b	.
+
+.Lcall_machine_check_common:
+	b	machine_check_common
+
+.Lcall_system_reset_common:
+	b	system_reset_common
+#endif
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
new file mode 100644
index 0000000000..61290282fd
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -0,0 +1,670 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/64/mmu-hash.h>
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
+				struct kvm_vcpu *vcpu,
+				gva_t eaddr)
+{
+	int i;
+	u64 esid = GET_ESID(eaddr);
+	u64 esid_1t = GET_ESID_1T(eaddr);
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		u64 cmp_esid = esid;
+
+		if (!vcpu->arch.slb[i].valid)
+			continue;
+
+		if (vcpu->arch.slb[i].tb)
+			cmp_esid = esid_1t;
+
+		if (vcpu->arch.slb[i].esid == cmp_esid)
+			return &vcpu->arch.slb[i];
+	}
+
+	dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
+		eaddr, esid, esid_1t);
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+	    if (vcpu->arch.slb[i].vsid)
+		dprintk("  %d: %c%c%c %llx %llx\n", i,
+			vcpu->arch.slb[i].valid ? 'v' : ' ',
+			vcpu->arch.slb[i].large ? 'l' : ' ',
+			vcpu->arch.slb[i].tb    ? 't' : ' ',
+			vcpu->arch.slb[i].esid,
+			vcpu->arch.slb[i].vsid);
+	}
+
+	return NULL;
+}
+
+static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe)
+{
+	return slbe->tb ? SID_SHIFT_1T : SID_SHIFT;
+}
+
+static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe)
+{
+	return (1ul << kvmppc_slb_sid_shift(slbe)) - 1;
+}
+
+static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr)
+{
+	eaddr &= kvmppc_slb_offset_mask(slb);
+
+	return (eaddr >> VPN_SHIFT) |
+		((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT));
+}
+
+static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+					 bool data)
+{
+	struct kvmppc_slb *slb;
+
+	slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+	if (!slb)
+		return 0;
+
+	return kvmppc_slb_calc_vpn(slb, eaddr);
+}
+
+static int mmu_pagesize(int mmu_pg)
+{
+	switch (mmu_pg) {
+	case MMU_PAGE_64K:
+		return 16;
+	case MMU_PAGE_16M:
+		return 24;
+	}
+	return 12;
+}
+
+static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
+{
+	return mmu_pagesize(slbe->base_page_size);
+}
+
+static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+
+	return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
+}
+
+static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu,
+				struct kvmppc_slb *slbe, gva_t eaddr,
+				bool second)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u64 hash, pteg, htabsize;
+	u32 ssize;
+	hva_t r;
+	u64 vpn;
+
+	htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
+
+	vpn = kvmppc_slb_calc_vpn(slbe, eaddr);
+	ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M;
+	hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);
+	if (second)
+		hash = ~hash;
+	hash &= ((1ULL << 39ULL) - 1ULL);
+	hash &= htabsize;
+	hash <<= 7ULL;
+
+	pteg = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+	pteg |= hash;
+
+	dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
+		page, vcpu_book3s->sdr1, pteg, slbe->vsid);
+
+	/* When running a PAPR guest, SDR1 contains a HVA address instead
+           of a GPA */
+	if (vcpu->arch.papr_enabled)
+		r = pteg;
+	else
+		r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
+
+	if (kvm_is_error_hva(r))
+		return r;
+	return r | (pteg & ~PAGE_MASK);
+}
+
+static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+	u64 avpn;
+
+	avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
+	avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
+
+	if (p < 16)
+		avpn >>= ((80 - p) - 56) - 8;	/* 16 - p */
+	else
+		avpn <<= p - 16;
+
+	return avpn;
+}
+
+/*
+ * Return page size encoded in the second word of a HPTE, or
+ * -1 for an invalid encoding for the base page size indicated by
+ * the SLB entry.  This doesn't handle mixed pagesize segments yet.
+ */
+static int decode_pagesize(struct kvmppc_slb *slbe, u64 r)
+{
+	switch (slbe->base_page_size) {
+	case MMU_PAGE_64K:
+		if ((r & 0xf000) == 0x1000)
+			return MMU_PAGE_64K;
+		break;
+	case MMU_PAGE_16M:
+		if ((r & 0xff000) == 0)
+			return MMU_PAGE_16M;
+		break;
+	}
+	return -1;
+}
+
+static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				      struct kvmppc_pte *gpte, bool data,
+				      bool iswrite)
+{
+	struct kvmppc_slb *slbe;
+	hva_t ptegp;
+	u64 pteg[16];
+	u64 avpn = 0;
+	u64 r;
+	u64 v_val, v_mask;
+	u64 eaddr_mask;
+	int i;
+	u8 pp, key = 0;
+	bool found = false;
+	bool second = false;
+	int pgsize;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		gpte->eaddr = eaddr;
+		gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+		gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
+		gpte->raddr &= KVM_PAM;
+		gpte->may_execute = true;
+		gpte->may_read = true;
+		gpte->may_write = true;
+		gpte->page_size = MMU_PAGE_4K;
+		gpte->wimg = HPTE_R_M;
+
+		return 0;
+	}
+
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+	if (!slbe)
+		goto no_seg_found;
+
+	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+	v_val = avpn & HPTE_V_AVPN;
+
+	if (slbe->tb)
+		v_val |= SLB_VSID_B_1T;
+	if (slbe->large)
+		v_val |= HPTE_V_LARGE;
+	v_val |= HPTE_V_VALID;
+
+	v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
+		HPTE_V_SECONDARY;
+
+	pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K;
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+
+do_second:
+	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second);
+	if (kvm_is_error_hva(ptegp))
+		goto no_page_found;
+
+	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+		printk_ratelimited(KERN_ERR
+			"KVM: Can't copy data from 0x%lx!\n", ptegp);
+		goto no_page_found;
+	}
+
+	if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp)
+		key = 4;
+	else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks)
+		key = 4;
+
+	for (i=0; i<16; i+=2) {
+		u64 pte0 = be64_to_cpu(pteg[i]);
+		u64 pte1 = be64_to_cpu(pteg[i + 1]);
+
+		/* Check all relevant fields of 1st dword */
+		if ((pte0 & v_mask) == v_val) {
+			/* If large page bit is set, check pgsize encoding */
+			if (slbe->large &&
+			    (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+				pgsize = decode_pagesize(slbe, pte1);
+				if (pgsize < 0)
+					continue;
+			}
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (second)
+			goto no_page_found;
+		v_val |= HPTE_V_SECONDARY;
+		second = true;
+		goto do_second;
+	}
+
+	r = be64_to_cpu(pteg[i+1]);
+	pp = (r & HPTE_R_PP) | key;
+	if (r & HPTE_R_PP0)
+		pp |= 8;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+
+	eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1;
+	gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
+	gpte->page_size = pgsize;
+	gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+	if (unlikely(vcpu->arch.disable_kernel_nx) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR))
+		gpte->may_execute = true;
+	gpte->may_read = false;
+	gpte->may_write = false;
+	gpte->wimg = r & HPTE_R_WIMG;
+
+	switch (pp) {
+	case 0:
+	case 1:
+	case 2:
+	case 6:
+		gpte->may_write = true;
+		fallthrough;
+	case 3:
+	case 5:
+	case 7:
+	case 10:
+		gpte->may_read = true;
+		break;
+	}
+
+	dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
+		"-> 0x%lx\n",
+		eaddr, avpn, gpte->vpage, gpte->raddr);
+
+	/* Update PTE R and C bits, so the guest's swapper knows we used the
+	 * page */
+	if (gpte->may_read && !(r & HPTE_R_R)) {
+		/*
+		 * Set the accessed flag.
+		 * We have to write this back with a single byte write
+		 * because another vcpu may be accessing this on
+		 * non-PAPR platforms such as mac99, and this is
+		 * what real hardware does.
+		 */
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
+		r |= HPTE_R_R;
+		put_user(r >> 8, addr + 6);
+	}
+	if (iswrite && gpte->may_write && !(r & HPTE_R_C)) {
+		/* Set the dirty flag */
+		/* Use a single byte write */
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
+		r |= HPTE_R_C;
+		put_user(r, addr + 7);
+	}
+
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+
+	if (!gpte->may_read || (iswrite && !gpte->may_write))
+		return -EPERM;
+	return 0;
+
+no_page_found:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	return -ENOENT;
+
+no_seg_found:
+	dprintk("KVM MMU: Trigger segment fault\n");
+	return -EINVAL;
+}
+
+static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
+{
+	u64 esid, esid_1t;
+	int slb_nr;
+	struct kvmppc_slb *slbe;
+
+	dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb);
+
+	esid = GET_ESID(rb);
+	esid_1t = GET_ESID_1T(rb);
+	slb_nr = rb & 0xfff;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
+	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
+	slbe->esid  = slbe->tb ? esid_1t : esid;
+	slbe->vsid  = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);
+	slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
+	slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;
+	slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;
+	slbe->nx    = (rs & SLB_VSID_N) ? 1 : 0;
+	slbe->class = (rs & SLB_VSID_C) ? 1 : 0;
+
+	slbe->base_page_size = MMU_PAGE_4K;
+	if (slbe->large) {
+		if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) {
+			switch (rs & SLB_VSID_LP) {
+			case SLB_VSID_LP_00:
+				slbe->base_page_size = MMU_PAGE_16M;
+				break;
+			case SLB_VSID_LP_01:
+				slbe->base_page_size = MMU_PAGE_64K;
+				break;
+			}
+		} else
+			slbe->base_page_size = MMU_PAGE_16M;
+	}
+
+	slbe->orige = rb & (ESID_MASK | SLB_ESID_V);
+	slbe->origv = rs;
+
+	/* Map the new segment */
+	kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT);
+}
+
+static int kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr,
+				       ulong *ret_slb)
+{
+	struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+
+	if (slbe) {
+		*ret_slb = slbe->origv;
+		return 0;
+	}
+	*ret_slb = 0;
+	return -ENOENT;
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+	struct kvmppc_slb *slbe;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return 0;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	return slbe->orige;
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+	struct kvmppc_slb *slbe;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return 0;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	return slbe->origv;
+}
+
+static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
+{
+	struct kvmppc_slb *slbe;
+	u64 seg_size;
+
+	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
+
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
+
+	if (!slbe)
+		return;
+
+	dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
+
+	slbe->valid = false;
+	slbe->orige = 0;
+	slbe->origv = 0;
+
+	seg_size = 1ull << kvmppc_slb_sid_shift(slbe);
+	kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);
+}
+
+static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	dprintk("KVM MMU: slbia()\n");
+
+	for (i = 1; i < vcpu->arch.slb_nr; i++) {
+		vcpu->arch.slb[i].valid = false;
+		vcpu->arch.slb[i].orige = 0;
+		vcpu->arch.slb[i].origv = 0;
+	}
+
+	if (kvmppc_get_msr(vcpu) & MSR_IR) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+	}
+}
+
+static void kvmppc_mmu_book3s_64_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+					ulong value)
+{
+	u64 rb = 0, rs = 0;
+
+	/*
+	 * According to Book3 2.01 mtsrin is implemented as:
+	 *
+	 * The SLB entry specified by (RB)32:35 is loaded from register
+	 * RS, as follows.
+	 *
+	 * SLBE Bit	Source			SLB Field
+	 *
+	 * 0:31		0x0000_0000		ESID-0:31
+	 * 32:35	(RB)32:35		ESID-32:35
+	 * 36		0b1			V
+	 * 37:61	0x00_0000|| 0b0		VSID-0:24
+	 * 62:88	(RS)37:63		VSID-25:51
+	 * 89:91	(RS)33:35		Ks Kp N
+	 * 92		(RS)36			L ((RS)36 must be 0b0)
+	 * 93		0b0			C
+	 */
+
+	dprintk("KVM MMU: mtsrin(0x%x, 0x%lx)\n", srnum, value);
+
+	/* ESID = srnum */
+	rb |= (srnum & 0xf) << 28;
+	/* Set the valid bit */
+	rb |= 1 << 27;
+	/* Index = ESID */
+	rb |= srnum;
+
+	/* VSID = VSID */
+	rs |= (value & 0xfffffff) << 12;
+	/* flags = flags */
+	rs |= ((value >> 28) & 0x7) << 9;
+
+	kvmppc_mmu_book3s_64_slbmte(vcpu, rs, rb);
+}
+
+static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,
+				       bool large)
+{
+	u64 mask = 0xFFFFFFFFFULL;
+	unsigned long i;
+	struct kvm_vcpu *v;
+
+	dprintk("KVM MMU: tlbie(0x%lx)\n", va);
+
+	/*
+	 * The tlbie instruction changed behaviour starting with
+	 * POWER6.  POWER6 and later don't have the large page flag
+	 * in the instruction but in the RB value, along with bits
+	 * indicating page and segment sizes.
+	 */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) {
+		/* POWER6 or later */
+		if (va & 1) {		/* L bit */
+			if ((va & 0xf000) == 0x1000)
+				mask = 0xFFFFFFFF0ULL;	/* 64k page */
+			else
+				mask = 0xFFFFFF000ULL;	/* 16M page */
+		}
+	} else {
+		/* older processors, e.g. PPC970 */
+		if (large)
+			mask = 0xFFFFFF000ULL;
+	}
+	/* flush this VA on all vcpus */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		kvmppc_mmu_pte_vflush(v, va >> 12, mask);
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid)
+{
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) &&
+		(mp_ea >> SID_SHIFT) == esid;
+}
+#endif
+
+static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid)
+{
+	ulong ea = esid << SID_SHIFT;
+	struct kvmppc_slb *slb;
+	u64 gvsid = esid;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+	int pagesize = MMU_PAGE_64K;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	if (msr & (MSR_DR|MSR_IR)) {
+		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
+		if (slb) {
+			gvsid = slb->vsid;
+			pagesize = slb->base_page_size;
+			if (slb->tb) {
+				gvsid <<= SID_SHIFT_1T - SID_SHIFT;
+				gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
+				gvsid |= VSID_1T;
+			}
+		}
+	}
+
+	switch (msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		gvsid = VSID_REAL | esid;
+		break;
+	case MSR_IR:
+		gvsid |= VSID_REAL_IR;
+		break;
+	case MSR_DR:
+		gvsid |= VSID_REAL_DR;
+		break;
+	case MSR_DR|MSR_IR:
+		if (!slb)
+			goto no_slb;
+
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/*
+	 * Mark this as a 64k segment if the host is using
+	 * 64k pages, the host MMU supports 64k pages and
+	 * the guest segment page size is >= 64k,
+	 * but not if this segment contains the magic page.
+	 */
+	if (pagesize >= MMU_PAGE_64K &&
+	    mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    !segment_contains_magic_page(vcpu, esid))
+		gvsid |= VSID_64K;
+#endif
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	*vsid = gvsid;
+	return 0;
+
+no_slb:
+	/* Catch magic page case */
+	if (unlikely(mp_ea) &&
+	    unlikely(esid == (mp_ea >> SID_SHIFT)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		*vsid = VSID_REAL | esid;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+	return (to_book3s(vcpu)->hid[5] & 0x80);
+}
+
+void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	mmu->mfsrin = NULL;
+	mmu->mtsrin = kvmppc_mmu_book3s_64_mtsrin;
+	mmu->slbmte = kvmppc_mmu_book3s_64_slbmte;
+	mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee;
+	mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev;
+	mmu->slbfee = kvmppc_mmu_book3s_64_slbfee;
+	mmu->slbie = kvmppc_mmu_book3s_64_slbie;
+	mmu->slbia = kvmppc_mmu_book3s_64_slbia;
+	mmu->xlate = kvmppc_mmu_book3s_64_xlate;
+	mmu->tlbie = kvmppc_mmu_book3s_64_tlbie;
+	mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid;
+	mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp;
+	mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
new file mode 100644
index 0000000000..bc6a381b53
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *     Kevin Wolf <mail@kevin-wolf.de>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/pkeys.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+#include "trace_pr.h"
+#include "book3s.h"
+
+#define PTE_SIZE 12
+
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	mmu_hash_ops.hpte_invalidate(pte->slot, pte->host_vpn,
+				     pte->pagesize, pte->pagesize,
+				     MMU_SEGSIZE_256M, false);
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	u16 sid_map_mask;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
+		return map;
+	}
+
+	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
+		return map;
+	}
+
+	trace_kvm_book3s_slb_fail(sid_map_mask, gvsid);
+	return NULL;
+}
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+			bool iswrite)
+{
+	unsigned long vpn;
+	kvm_pfn_t hpaddr;
+	ulong hash, hpteg;
+	u64 vsid;
+	int ret;
+	int rflags = 0x192;
+	int vflags = 0;
+	int attempt = 0;
+	struct kvmppc_sid_map *map;
+	int r = 0;
+	int hpsize = MMU_PAGE_4K;
+	bool writable;
+	unsigned long mmu_seq;
+	struct kvm *kvm = vcpu->kvm;
+	struct hpte_cache *cpte;
+	unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT;
+	unsigned long pfn;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	/* Get host physical address for gpa */
+	pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable);
+	if (is_error_noslot_pfn(pfn)) {
+		printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n",
+		       orig_pte->raddr);
+		r = -EINVAL;
+		goto out;
+	}
+	hpaddr = pfn << PAGE_SHIFT;
+
+	/* and write the mapping ea -> hpa into the pt */
+	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+	map = find_sid_vsid(vcpu, vsid);
+	if (!map) {
+		ret = kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+		WARN_ON(ret < 0);
+		map = find_sid_vsid(vcpu, vsid);
+	}
+	if (!map) {
+		printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
+				vsid, orig_pte->eaddr);
+		WARN_ON(true);
+		r = -EINVAL;
+		goto out;
+	}
+
+	vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M);
+
+	kvm_set_pfn_accessed(pfn);
+	if (!orig_pte->may_write || !writable)
+		rflags |= PP_RXRX;
+	else {
+		mark_page_dirty(vcpu->kvm, gfn);
+		kvm_set_pfn_dirty(pfn);
+	}
+
+	if (!orig_pte->may_execute)
+		rflags |= HPTE_R_N;
+	else
+		kvmppc_mmu_flush_icache(pfn);
+
+	rflags |= pte_to_hpte_pkey_bits(0, HPTE_USE_KERNEL_KEY);
+	rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg;
+
+	/*
+	 * Use 64K pages if possible; otherwise, on 64K page kernels,
+	 * we need to transfer 4 more bits from guest real to host real addr.
+	 */
+	if (vsid & VSID_64K)
+		hpsize = MMU_PAGE_64K;
+	else
+		hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
+
+	hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M);
+
+	cpte = kvmppc_mmu_hpte_cache_next(vcpu);
+
+	spin_lock(&kvm->mmu_lock);
+	if (!cpte || mmu_invalidate_retry(kvm, mmu_seq)) {
+		r = -EAGAIN;
+		goto out_unlock;
+	}
+
+map_again:
+	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+	/* In case we tried normal mapping already, let's nuke old entries */
+	if (attempt > 1)
+		if (mmu_hash_ops.hpte_remove(hpteg) < 0) {
+			r = -1;
+			goto out_unlock;
+		}
+
+	ret = mmu_hash_ops.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags,
+				       hpsize, hpsize, MMU_SEGSIZE_256M);
+
+	if (ret == -1) {
+		/* If we couldn't map a primary PTE, try a secondary */
+		hash = ~hash;
+		vflags ^= HPTE_V_SECONDARY;
+		attempt++;
+		goto map_again;
+	} else if (ret < 0) {
+		r = -EIO;
+		goto out_unlock;
+	} else {
+		trace_kvm_book3s_64_mmu_map(rflags, hpteg,
+					    vpn, hpaddr, orig_pte);
+
+		/*
+		 * The mmu_hash_ops code may give us a secondary entry even
+		 * though we asked for a primary. Fix up.
+		 */
+		if ((ret & _PTEIDX_SECONDARY) && !(vflags & HPTE_V_SECONDARY)) {
+			hash = ~hash;
+			hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+		}
+
+		cpte->slot = hpteg + (ret & 7);
+		cpte->host_vpn = vpn;
+		cpte->pte = *orig_pte;
+		cpte->pfn = pfn;
+		cpte->pagesize = hpsize;
+
+		kvmppc_mmu_hpte_cache_map(vcpu, cpte);
+		cpte = NULL;
+	}
+
+out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	if (cpte)
+		kvmppc_mmu_hpte_cache_free(cpte);
+
+out:
+	return r;
+}
+
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	u64 mask = 0xfffffffffULL;
+	u64 vsid;
+
+	vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid);
+	if (vsid & VSID_64K)
+		mask = 0xffffffff0ULL;
+	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask);
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	unsigned long vsid_bits = VSID_BITS_65_256M;
+	struct kvmppc_sid_map *map;
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u16 sid_map_mask;
+	static int backwards_map;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	/* We might get collisions that trap in preceding order, so let's
+	   map them differently */
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	if (backwards_map)
+		sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+	/* Make sure we're taking the other map next time */
+	backwards_map = !backwards_map;
+
+	/* Uh-oh ... out of mappings. Let's flush! */
+	if (vcpu_book3s->proto_vsid_next == vcpu_book3s->proto_vsid_max) {
+		vcpu_book3s->proto_vsid_next = vcpu_book3s->proto_vsid_first;
+		memset(vcpu_book3s->sid_map, 0,
+		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+	}
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		vsid_bits = VSID_BITS_256M;
+
+	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
+				       VSID_MULTIPLIER_256M, vsid_bits);
+
+	map->guest_vsid = gvsid;
+	map->valid = true;
+
+	trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid);
+
+	return map;
+}
+
+static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	int i;
+	int max_slb_size = 64;
+	int found_inval = -1;
+	int r;
+
+	/* Are we overwriting? */
+	for (i = 0; i < svcpu->slb_max; i++) {
+		if (!(svcpu->slb[i].esid & SLB_ESID_V))
+			found_inval = i;
+		else if ((svcpu->slb[i].esid & ESID_MASK) == esid) {
+			r = i;
+			goto out;
+		}
+	}
+
+	/* Found a spare entry that was invalidated before */
+	if (found_inval >= 0) {
+		r = found_inval;
+		goto out;
+	}
+
+	/* No spare invalid entry, so create one */
+
+	if (mmu_slb_size < 64)
+		max_slb_size = mmu_slb_size;
+
+	/* Overflowing -> purge */
+	if ((svcpu->slb_max) == max_slb_size)
+		kvmppc_mmu_flush_segments(vcpu);
+
+	r = svcpu->slb_max;
+	svcpu->slb_max++;
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u64 esid = eaddr >> SID_SHIFT;
+	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
+	u64 slb_vsid = SLB_VSID_USER;
+	u64 gvsid;
+	int slb_index;
+	struct kvmppc_sid_map *map;
+	int r = 0;
+
+	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
+
+	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+		/* Invalidate an entry */
+		svcpu->slb[slb_index].esid = 0;
+		r = -ENOENT;
+		goto out;
+	}
+
+	map = find_sid_vsid(vcpu, gvsid);
+	if (!map)
+		map = create_sid_map(vcpu, gvsid);
+
+	map->guest_esid = esid;
+
+	slb_vsid |= (map->host_vsid << 12);
+	slb_vsid &= ~SLB_VSID_KP;
+	slb_esid |= slb_index;
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/* Set host segment base page size to 64K if possible */
+	if (gvsid & VSID_64K)
+		slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp;
+#endif
+
+	svcpu->slb[slb_index].esid = slb_esid;
+	svcpu->slb[slb_index].vsid = slb_vsid;
+
+	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong seg_mask = -seg_size;
+	int i;
+
+	for (i = 0; i < svcpu->slb_max; i++) {
+		if ((svcpu->slb[i].esid & SLB_ESID_V) &&
+		    (svcpu->slb[i].esid & seg_mask) == ea) {
+			/* Invalidate this entry */
+			svcpu->slb[i].esid = 0;
+		}
+	}
+
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->slb_max = 0;
+	svcpu->slb[0].esid = 0;
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_hpte_destroy(vcpu);
+	__destroy_context(to_book3s(vcpu)->context_id[0]);
+}
+
+int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int err;
+
+	err = hash__alloc_context_id();
+	if (err < 0)
+		return -1;
+	vcpu3s->context_id[0] = err;
+
+	vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1)
+				  << ESID_BITS) - 1;
+	vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS;
+	vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;
+
+	kvmppc_mmu_hpte_init(vcpu);
+
+	return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 0000000000..fdfc2a62dd
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,2150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+#include <linux/srcu.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/pte-walk.h>
+
+#include "book3s.h"
+#include "book3s_hv.h"
+#include "trace_hv.h"
+
+//#define DEBUG_RESIZE_HPT	1
+
+#ifdef DEBUG_RESIZE_HPT
+#define resize_hpt_debug(resize, ...)				\
+	do {							\
+		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
+		printk(__VA_ARGS__);				\
+	} while (0)
+#else
+#define resize_hpt_debug(resize, ...)				\
+	do { } while (0)
+#endif
+
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret);
+
+struct kvm_resize_hpt {
+	/* These fields read-only after init */
+	struct kvm *kvm;
+	struct work_struct work;
+	u32 order;
+
+	/* These fields protected by kvm->arch.mmu_setup_lock */
+
+	/* Possible values and their usage:
+	 *  <0     an error occurred during allocation,
+	 *  -EBUSY allocation is in the progress,
+	 *  0      allocation made successfully.
+	 */
+	int error;
+
+	/* Private to the work thread, until error != -EBUSY,
+	 * then protected by kvm->arch.mmu_setup_lock.
+	 */
+	struct kvm_hpt_info hpt;
+};
+
+int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
+{
+	unsigned long hpt = 0;
+	int cma = 0;
+	struct page *page = NULL;
+	struct revmap_entry *rev;
+	unsigned long npte;
+
+	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
+		return -EINVAL;
+
+	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
+	if (page) {
+		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+		memset((void *)hpt, 0, (1ul << order));
+		cma = 1;
+	}
+
+	if (!hpt)
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
+				       |__GFP_NOWARN, order - PAGE_SHIFT);
+
+	if (!hpt)
+		return -ENOMEM;
+
+	/* HPTEs are 2**4 bytes long */
+	npte = 1ul << (order - 4);
+
+	/* Allocate reverse map array */
+	rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
+	if (!rev) {
+		if (cma)
+			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
+		else
+			free_pages(hpt, order - PAGE_SHIFT);
+		return -ENOMEM;
+	}
+
+	info->order = order;
+	info->virt = hpt;
+	info->cma = cma;
+	info->rev = rev;
+
+	return 0;
+}
+
+void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
+{
+	atomic64_set(&kvm->arch.mmio_update, 0);
+	kvm->arch.hpt = *info;
+	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
+
+	pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
+		 info->virt, (long)info->order, kvm->arch.lpid);
+}
+
+int kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
+{
+	int err = -EBUSY;
+	struct kvm_hpt_info info;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+	if (kvm->arch.mmu_ready) {
+		kvm->arch.mmu_ready = 0;
+		/* order mmu_ready vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.mmu_ready = 1;
+			goto out;
+		}
+	}
+	if (kvm_is_radix(kvm)) {
+		err = kvmppc_switch_mmu_to_hpt(kvm);
+		if (err)
+			goto out;
+	}
+
+	if (kvm->arch.hpt.order == order) {
+		/* We already have a suitable HPT */
+
+		/* Set the entire HPT to 0, i.e. invalid HPTEs */
+		memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
+		/*
+		 * Reset all the reverse-mapping chains for all memslots
+		 */
+		kvmppc_rmap_reset(kvm);
+		err = 0;
+		goto out;
+	}
+
+	if (kvm->arch.hpt.virt) {
+		kvmppc_free_hpt(&kvm->arch.hpt);
+		kvmppc_rmap_reset(kvm);
+	}
+
+	err = kvmppc_allocate_hpt(&info, order);
+	if (err < 0)
+		goto out;
+	kvmppc_set_hpt(kvm, &info);
+
+out:
+	if (err == 0)
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return err;
+}
+
+void kvmppc_free_hpt(struct kvm_hpt_info *info)
+{
+	vfree(info->rev);
+	info->rev = NULL;
+	if (info->cma)
+		kvm_free_hpt_cma(virt_to_page((void *)info->virt),
+				 1 << (info->order - PAGE_SHIFT));
+	else if (info->virt)
+		free_pages(info->virt, info->order - PAGE_SHIFT);
+	info->virt = 0;
+	info->order = 0;
+}
+
+/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
+}
+
+/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize == 0x10000) ? 0x1000 : 0;
+}
+
+void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+		     unsigned long porder)
+{
+	unsigned long i;
+	unsigned long npages;
+	unsigned long hp_v, hp_r;
+	unsigned long addr, hash;
+	unsigned long psize;
+	unsigned long hp0, hp1;
+	unsigned long idx_ret;
+	long ret;
+	struct kvm *kvm = vcpu->kvm;
+
+	psize = 1ul << porder;
+	npages = memslot->npages >> (porder - PAGE_SHIFT);
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - porder))
+		npages = 1ul << (40 - porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
+		npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
+
+	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
+	hp1 = hpte1_pgsize_encoding(psize) |
+		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+
+	for (i = 0; i < npages; ++i) {
+		addr = i << porder;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
+			& kvmppc_hpt_mask(&kvm->arch.hpt);
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hash = (hash << 3) + 7;
+		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
+		hp_r = hp1 | addr;
+		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
+						 &idx_ret);
+		if (ret != H_SUCCESS) {
+			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
+			       addr, ret);
+			break;
+		}
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	unsigned long nr_lpids;
+
+	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
+		return -EINVAL;
+
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (WARN_ON(mfspr(SPRN_LPID) != 0))
+			return -EINVAL;
+		nr_lpids = 1UL << mmu_lpid_bits;
+	} else {
+		nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT;
+	}
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			WARN_ON(nr_lpids != 1UL << 12);
+		else
+			WARN_ON(nr_lpids != 1UL << 10);
+
+		/*
+		 * Reserve the last implemented LPID use in partition
+		 * switching for POWER7 and POWER8.
+		 */
+		nr_lpids -= 1;
+	}
+
+	kvmppc_init_lpid(nr_lpids);
+
+	return 0;
+}
+
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret)
+{
+	long ret;
+
+	preempt_disable();
+	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
+				kvm->mm->pgd, false, pte_idx_ret);
+	preempt_enable();
+	if (ret == H_TOO_HARD) {
+		/* this can't happen */
+		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
+		ret = H_RESOURCE;	/* or something */
+	}
+	return ret;
+
+}
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
+							 gva_t eaddr)
+{
+	u64 mask;
+	int i;
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
+			continue;
+
+		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
+			mask = ESID_MASK_1T;
+		else
+			mask = ESID_MASK;
+
+		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
+			return &vcpu->arch.slb[i];
+	}
+	return NULL;
+}
+
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
+			unsigned long ea)
+{
+	unsigned long ra_mask;
+
+	ra_mask = kvmppc_actual_pgsz(v, r) - 1;
+	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long slb_v;
+	unsigned long pp, key;
+	unsigned long v, orig_v, gr;
+	__be64 *hptep;
+	long int index;
+	int virtmode = __kvmppc_get_msr_hv(vcpu) & (data ? MSR_DR : MSR_IR);
+
+	if (kvm_is_radix(vcpu->kvm))
+		return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
+
+	/* Get SLB entry */
+	if (virtmode) {
+		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+		if (!slbe)
+			return -EINVAL;
+		slb_v = slbe->origv;
+	} else {
+		/* real mode access */
+		slb_v = vcpu->kvm->arch.vrma_slb_v;
+	}
+
+	preempt_disable();
+	/* Find the HPTE in the hash table */
+	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
+					 HPTE_V_VALID | HPTE_V_ABSENT);
+	if (index < 0) {
+		preempt_enable();
+		return -ENOENT;
+	}
+	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
+	gr = kvm->arch.hpt.rev[index].guest_rpte;
+
+	unlock_hpte(hptep, orig_v);
+	preempt_enable();
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
+
+	/* Get PP bits and key for permission check */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (__kvmppc_get_msr_hv(vcpu) & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	key &= slb_v;
+
+	/* Calculate permissions */
+	gpte->may_read = hpte_read_permission(pp, key);
+	gpte->may_write = hpte_write_permission(pp, key);
+	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
+
+	/* Storage key permission check for POWER7 */
+	if (data && virtmode) {
+		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (amrfield & 1)
+			gpte->may_read = 0;
+		if (amrfield & 2)
+			gpte->may_write = 0;
+	}
+
+	/* Get the guest physical address */
+	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
+	return 0;
+}
+
+/*
+ * Quick test for whether an instruction is a load or a store.
+ * If the instruction is a load or a store, then this will indicate
+ * which it is, at least on server processors.  (Embedded processors
+ * have some external PID instructions that don't follow the rule
+ * embodied here.)  If the instruction isn't a load or store, then
+ * this doesn't return anything useful.
+ */
+static int instruction_is_store(ppc_inst_t instr)
+{
+	unsigned int mask;
+	unsigned int suffix;
+
+	mask = 0x10000000;
+	suffix = ppc_inst_val(instr);
+	if (ppc_inst_prefixed(instr))
+		suffix = ppc_inst_suffix(instr);
+	else if ((suffix & 0xfc000000) == 0x7c000000)
+		mask = 0x100;		/* major opcode 31 */
+	return (suffix & mask) != 0;
+}
+
+int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu,
+			   unsigned long gpa, gva_t ea, int is_store)
+{
+	ppc_inst_t last_inst;
+	bool is_prefixed = !!(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+
+	/*
+	 * Fast path - check if the guest physical address corresponds to a
+	 * device on the FAST_MMIO_BUS, if so we can avoid loading the
+	 * instruction all together, then we can just handle it and return.
+	 */
+	if (is_store) {
+		int idx, ret;
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
+				       NULL);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		if (!ret) {
+			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + (is_prefixed ? 8 : 4));
+			return RESUME_GUEST;
+		}
+	}
+
+	/*
+	 * If we fail, we just return to the guest and try executing it again.
+	 */
+	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
+		EMULATE_DONE)
+		return RESUME_GUEST;
+
+	/*
+	 * WARNING: We do not know for sure whether the instruction we just
+	 * read from memory is the same that caused the fault in the first
+	 * place.
+	 *
+	 * If the fault is prefixed but the instruction is not or vice
+	 * versa, try again so that we don't advance pc the wrong amount.
+	 */
+	if (ppc_inst_prefixed(last_inst) != is_prefixed)
+		return RESUME_GUEST;
+
+	/*
+	 * If the instruction we read is neither an load or a store,
+	 * then it can't access memory, so we don't need to worry about
+	 * enforcing access permissions.  So, assuming it is a load or
+	 * store, we just check that its direction (load or store) is
+	 * consistent with the original fault, since that's what we
+	 * checked the access permissions against.  If there is a mismatch
+	 * we just return and retry the instruction.
+	 */
+
+	if (instruction_is_store(last_inst) != !!is_store)
+		return RESUME_GUEST;
+
+	/*
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime in which
+	 * point performing the subsequent memory access on the old
+	 * physical address could possibly be a security hole for the
+	 * guest (but not the host).
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now.
+	 */
+
+	vcpu->arch.paddr_accessed = gpa;
+	vcpu->arch.vaddr_accessed = ea;
+	return kvmppc_emulate_mmio(vcpu);
+}
+
+int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
+				unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long hpte[3], r;
+	unsigned long hnow_v, hnow_r;
+	__be64 *hptep;
+	unsigned long mmu_seq, psize, pte_size;
+	unsigned long gpa_base, gfn_base;
+	unsigned long gpa, gfn, hva, pfn, hpa;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	struct revmap_entry *rev;
+	struct page *page;
+	long index, ret;
+	bool is_ci;
+	bool writing, write_ok;
+	unsigned int shift;
+	unsigned long rcbits;
+	long mmio_update;
+	pte_t pte, *ptep;
+
+	if (kvm_is_radix(kvm))
+		return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);
+
+	/*
+	 * Real-mode code has already searched the HPT and found the
+	 * entry we're interested in.  Lock the entry and check that
+	 * it hasn't changed.  If it has, just return and re-execute the
+	 * instruction.
+	 */
+	if (ea != vcpu->arch.pgfault_addr)
+		return RESUME_GUEST;
+
+	if (vcpu->arch.pgfault_cache) {
+		mmio_update = atomic64_read(&kvm->arch.mmio_update);
+		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
+			r = vcpu->arch.pgfault_cache->rpte;
+			psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
+						   r);
+			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+			gfn_base = gpa_base >> PAGE_SHIFT;
+			gpa = gpa_base | (ea & (psize - 1));
+			return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
+						dsisr & DSISR_ISSTORE);
+		}
+	}
+	index = vcpu->arch.pgfault_index;
+	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+	rev = &kvm->arch.hpt.rev[index];
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	hpte[1] = be64_to_cpu(hptep[1]);
+	hpte[2] = r = rev->guest_rpte;
+	unlock_hpte(hptep, hpte[0]);
+	preempt_enable();
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
+		hpte[1] = hpte_new_to_old_r(hpte[1]);
+	}
+	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
+	    hpte[1] != vcpu->arch.pgfault_hpte[1])
+		return RESUME_GUEST;
+
+	/* Translate the logical address and get the page */
+	psize = kvmppc_actual_pgsz(hpte[0], r);
+	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+	gfn_base = gpa_base >> PAGE_SHIFT;
+	gpa = gpa_base | (ea & (psize - 1));
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
+					      dsisr & DSISR_ISSTORE);
+
+	/*
+	 * This should never happen, because of the slot_is_aligned()
+	 * check in kvmppc_do_h_enter().
+	 */
+	if (gfn_base < memslot->base_gfn)
+		return -EFAULT;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	ret = -EFAULT;
+	page = NULL;
+	writing = (dsisr & DSISR_ISSTORE) != 0;
+	/* If writing != 0, then the HPTE must allow writing, if we get here */
+	write_ok = writing;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+
+	/*
+	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+	 * do it with !atomic && !async, which is how we call it.
+	 * We always ask for write permission since the common case
+	 * is that the page is writable.
+	 */
+	if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
+		write_ok = true;
+	} else {
+		/* Call KVM generic code to do the slow-path check */
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+					   writing, &write_ok, NULL);
+		if (is_error_noslot_pfn(pfn))
+			return -EFAULT;
+		page = NULL;
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (PageReserved(page))
+				page = NULL;
+		}
+	}
+
+	/*
+	 * Read the PTE from the process' radix tree and use that
+	 * so we get the shift and attribute bits.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+	pte = __pte(0);
+	if (ptep)
+		pte = READ_ONCE(*ptep);
+	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * If the PTE disappeared temporarily due to a THP
+	 * collapse, just return and let the guest try again.
+	 */
+	if (!pte_present(pte)) {
+		if (page)
+			put_page(page);
+		return RESUME_GUEST;
+	}
+	hpa = pte_pfn(pte) << PAGE_SHIFT;
+	pte_size = PAGE_SIZE;
+	if (shift)
+		pte_size = 1ul << shift;
+	is_ci = pte_ci(pte);
+
+	if (psize > pte_size)
+		goto out_put;
+	if (pte_size > psize)
+		hpa |= hva & (pte_size - psize);
+
+	/* Check WIMG vs. the actual page we're accessing */
+	if (!hpte_cache_flags_ok(r, is_ci)) {
+		if (is_ci)
+			goto out_put;
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
+	}
+
+	/*
+	 * Set the HPTE to point to hpa.
+	 * Since the hpa is at PAGE_SIZE granularity, make sure we
+	 * don't mask out lower-order bits if psize < PAGE_SIZE.
+	 */
+	if (psize < PAGE_SIZE)
+		psize = PAGE_SIZE;
+	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
+	if (hpte_is_writable(r) && !write_ok)
+		r = hpte_make_readonly(r);
+	ret = RESUME_GUEST;
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	hnow_v = be64_to_cpu(hptep[0]);
+	hnow_r = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
+		hnow_r = hpte_new_to_old_r(hnow_r);
+	}
+
+	/*
+	 * If the HPT is being resized, don't update the HPTE,
+	 * instead let the guest retry after the resize operation is complete.
+	 * The synchronization for mmu_ready test vs. set is provided
+	 * by the HPTE lock.
+	 */
+	if (!kvm->arch.mmu_ready)
+		goto out_unlock;
+
+	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
+	    rev->guest_rpte != hpte[2])
+		/* HPTE has been changed under us; let the guest retry */
+		goto out_unlock;
+	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+
+	/* Always put the HPTE in the rmap chain for the page base address */
+	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
+	lock_rmap(rmap);
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	ret = RESUME_GUEST;
+	if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
+		unlock_rmap(rmap);
+		goto out_unlock;
+	}
+
+	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
+	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+
+	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
+		/* HPTE was previously valid, so we need to invalidate it */
+		unlock_rmap(rmap);
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, index);
+		/* don't lose previous R and C bits */
+		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+	} else {
+		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		r = hpte_old_to_new_r(hpte[0], r);
+		hpte[0] = hpte_old_to_new_v(hpte[0]);
+	}
+	hptep[1] = cpu_to_be64(r);
+	eieio();
+	__unlock_hpte(hptep, hpte[0]);
+	asm volatile("ptesync" : : : "memory");
+	preempt_enable();
+	if (page && hpte_is_writable(r))
+		set_page_dirty_lock(page);
+
+ out_put:
+	trace_kvm_page_fault_exit(vcpu, hpte, ret);
+
+	if (page)
+		put_page(page);
+	return ret;
+
+ out_unlock:
+	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	preempt_enable();
+	goto out_put;
+}
+
+void kvmppc_rmap_reset(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx, bkt;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, bkt, slots) {
+		/* Mutual exclusion with kvm_unmap_hva_range etc. */
+		spin_lock(&kvm->mmu_lock);
+		/*
+		 * This assumes it is acceptable to lose reference and
+		 * change bits across a reset.
+		 */
+		memset(memslot->arch.rmap, 0,
+		       memslot->npages * sizeof(*memslot->arch.rmap));
+		spin_unlock(&kvm->mmu_lock);
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* Must be called with both HPTE and rmap locked */
+static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+			      struct kvm_memory_slot *memslot,
+			      unsigned long *rmapp, unsigned long gfn)
+{
+	__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
+	unsigned long j, h;
+	unsigned long ptel, psize, rcbits;
+
+	j = rev[i].forw;
+	if (j == i) {
+		/* chain is now empty */
+		*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+	} else {
+		/* remove i from chain */
+		h = rev[i].back;
+		rev[h].forw = j;
+		rev[j].back = h;
+		rev[i].forw = rev[i].back = i;
+		*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+	}
+
+	/* Now check and modify the HPTE */
+	ptel = rev[i].guest_rpte;
+	psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
+	if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+	    hpte_rpn(ptel, psize) == gfn) {
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, i);
+		hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+		/* Harvest R and C */
+		rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+		if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
+			kvmppc_update_dirty_map(memslot, gfn, psize);
+		if (rcbits & ~rev[i].guest_rpte) {
+			rev[i].guest_rpte = ptel | rcbits;
+			note_hpte_modification(kvm, &rev[i]);
+		}
+	}
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			    unsigned long gfn)
+{
+	unsigned long i;
+	__be64 *hptep;
+	unsigned long *rmapp;
+
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+	for (;;) {
+		lock_rmap(rmapp);
+		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+			unlock_rmap(rmapp);
+			break;
+		}
+
+		/*
+		 * To avoid an ABBA deadlock with the HPTE lock bit,
+		 * we can't spin on the HPTE lock while holding the
+		 * rmap chain lock.
+		 */
+		i = *rmapp & KVMPPC_RMAP_INDEX;
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
+				cpu_relax();
+			continue;
+		}
+
+		kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
+		unlock_rmap(rmapp);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	}
+}
+
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	gfn_t gfn;
+
+	if (kvm_is_radix(kvm)) {
+		for (gfn = range->start; gfn < range->end; gfn++)
+			kvm_unmap_radix(kvm, range->slot, gfn);
+	} else {
+		for (gfn = range->start; gfn < range->end; gfn++)
+			kvm_unmap_rmapp(kvm, range->slot, gfn);
+	}
+
+	return false;
+}
+
+void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn;
+	unsigned long n;
+	unsigned long *rmapp;
+
+	gfn = memslot->base_gfn;
+	rmapp = memslot->arch.rmap;
+	if (kvm_is_radix(kvm)) {
+		kvmppc_radix_flush_memslot(kvm, memslot);
+		return;
+	}
+
+	for (n = memslot->npages; n; --n, ++gfn) {
+		/*
+		 * Testing the present bit without locking is OK because
+		 * the memslot has been marked invalid already, and hence
+		 * no new HPTEs referencing this page can be created,
+		 * thus the present bit can't go from 0 to 1.
+		 */
+		if (*rmapp & KVMPPC_RMAP_PRESENT)
+			kvm_unmap_rmapp(kvm, memslot, gfn);
+		++rmapp;
+	}
+}
+
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			  unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
+	unsigned long head, i, j;
+	__be64 *hptep;
+	bool ret = false;
+	unsigned long *rmapp;
+
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ retry:
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
+		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
+		ret = true;
+	}
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return ret;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+		j = rev[i].forw;
+
+		/* If this HPTE isn't referenced, ignore it */
+		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
+			kvmppc_clear_ref_hpte(kvm, hptep, i);
+			if (!(rev[i].guest_rpte & HPTE_R_R)) {
+				rev[i].guest_rpte |= HPTE_R_R;
+				note_hpte_modification(kvm, &rev[i]);
+			}
+			ret = true;
+		}
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	gfn_t gfn;
+	bool ret = false;
+
+	if (kvm_is_radix(kvm)) {
+		for (gfn = range->start; gfn < range->end; gfn++)
+			ret |= kvm_age_radix(kvm, range->slot, gfn);
+	} else {
+		for (gfn = range->start; gfn < range->end; gfn++)
+			ret |= kvm_age_rmapp(kvm, range->slot, gfn);
+	}
+
+	return ret;
+}
+
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			       unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
+	unsigned long head, i, j;
+	unsigned long *hp;
+	bool ret = true;
+	unsigned long *rmapp;
+
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		return true;
+
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		goto out;
+
+	if (*rmapp & KVMPPC_RMAP_PRESENT) {
+		i = head = *rmapp & KVMPPC_RMAP_INDEX;
+		do {
+			hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
+			j = rev[i].forw;
+			if (be64_to_cpu(hp[1]) & HPTE_R_R)
+				goto out;
+		} while ((i = j) != head);
+	}
+	ret = false;
+
+ out:
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	WARN_ON(range->start + 1 != range->end);
+
+	if (kvm_is_radix(kvm))
+		return kvm_test_age_radix(kvm, range->slot, range->start);
+	else
+		return kvm_test_age_rmapp(kvm, range->slot, range->start);
+}
+
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	WARN_ON(range->start + 1 != range->end);
+
+	if (kvm_is_radix(kvm))
+		kvm_unmap_radix(kvm, range->slot, range->start);
+	else
+		kvm_unmap_rmapp(kvm, range->slot, range->start);
+
+	return false;
+}
+
+static int vcpus_running(struct kvm *kvm)
+{
+	return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
+/*
+ * Returns the number of system pages that are dirty.
+ * This can be more than 1 if we find a huge-page HPTE.
+ */
+static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
+{
+	struct revmap_entry *rev = kvm->arch.hpt.rev;
+	unsigned long head, i, j;
+	unsigned long n;
+	unsigned long v, r;
+	__be64 *hptep;
+	int npages_dirty = 0;
+
+ retry:
+	lock_rmap(rmapp);
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return npages_dirty;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		unsigned long hptep1;
+		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+		j = rev[i].forw;
+
+		/*
+		 * Checking the C (changed) bit here is racy since there
+		 * is no guarantee about when the hardware writes it back.
+		 * If the HPTE is not writable then it is stable since the
+		 * page can't be written to, and we would have done a tlbie
+		 * (which forces the hardware to complete any writeback)
+		 * when making the HPTE read-only.
+		 * If vcpus are running then this call is racy anyway
+		 * since the page could get dirtied subsequently, so we
+		 * expect there to be a further call which would pick up
+		 * any delayed C bit writeback.
+		 * Otherwise we need to do the tlbie even if C==0 in
+		 * order to pick up any delayed writeback of C.
+		 */
+		hptep1 = be64_to_cpu(hptep[1]);
+		if (!(hptep1 & HPTE_R_C) &&
+		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
+			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+			continue;
+		}
+
+		/* need to make it temporarily absent so C is stable */
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, i);
+		v = be64_to_cpu(hptep[0]);
+		r = be64_to_cpu(hptep[1]);
+		if (r & HPTE_R_C) {
+			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
+			if (!(rev[i].guest_rpte & HPTE_R_C)) {
+				rev[i].guest_rpte |= HPTE_R_C;
+				note_hpte_modification(kvm, &rev[i]);
+			}
+			n = kvmppc_actual_pgsz(v, r);
+			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+			if (n > npages_dirty)
+				npages_dirty = n;
+			eieio();
+		}
+		v &= ~HPTE_V_ABSENT;
+		v |= HPTE_V_VALID;
+		__unlock_hpte(hptep, v);
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return npages_dirty;
+}
+
+void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			      struct kvm_memory_slot *memslot,
+			      unsigned long *map)
+{
+	unsigned long gfn;
+
+	if (!vpa->dirty || !vpa->pinned_addr)
+		return;
+	gfn = vpa->gpa >> PAGE_SHIFT;
+	if (gfn < memslot->base_gfn ||
+	    gfn >= memslot->base_gfn + memslot->npages)
+		return;
+
+	vpa->dirty = false;
+	if (map)
+		__set_bit_le(gfn - memslot->base_gfn, map);
+}
+
+long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
+{
+	unsigned long i;
+	unsigned long *rmapp;
+
+	preempt_disable();
+	rmapp = memslot->arch.rmap;
+	for (i = 0; i < memslot->npages; ++i) {
+		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
+		/*
+		 * Note that if npages > 0 then i must be a multiple of npages,
+		 * since we always put huge-page HPTEs in the rmap chain
+		 * corresponding to their page base address.
+		 */
+		if (npages)
+			set_dirty_bits(map, i, npages);
+		++rmapp;
+	}
+	preempt_enable();
+	return 0;
+}
+
+void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
+			    unsigned long *nb_ret)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct page *page, *pages[1];
+	int npages;
+	unsigned long hva, offset;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto err;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
+	if (npages < 1)
+		goto err;
+	page = pages[0];
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	offset = gpa & (PAGE_SIZE - 1);
+	if (nb_ret)
+		*nb_ret = PAGE_SIZE - offset;
+	return page_address(page) + offset;
+
+ err:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return NULL;
+}
+
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+			     bool dirty)
+{
+	struct page *page = virt_to_page(va);
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	int srcu_idx;
+
+	put_page(page);
+
+	if (!dirty)
+		return;
+
+	/* We need to mark this page dirty in the memslot dirty_bitmap, if any */
+	gfn = gpa >> PAGE_SHIFT;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot && memslot->dirty_bitmap)
+		set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/*
+ * HPT resizing
+ */
+static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
+{
+	int rc;
+
+	rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
+	if (rc < 0)
+		return rc;
+
+	resize_hpt_debug(resize, "%s(): HPT @ 0x%lx\n", __func__,
+			 resize->hpt.virt);
+
+	return 0;
+}
+
+static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
+					    unsigned long idx)
+{
+	struct kvm *kvm = resize->kvm;
+	struct kvm_hpt_info *old = &kvm->arch.hpt;
+	struct kvm_hpt_info *new = &resize->hpt;
+	unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
+	unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
+	__be64 *hptep, *new_hptep;
+	unsigned long vpte, rpte, guest_rpte;
+	int ret;
+	struct revmap_entry *rev;
+	unsigned long apsize, avpn, pteg, hash;
+	unsigned long new_idx, new_pteg, replace_vpte;
+	int pshift;
+
+	hptep = (__be64 *)(old->virt + (idx << 4));
+
+	/* Guest is stopped, so new HPTEs can't be added or faulted
+	 * in, only unmapped or altered by host actions.  So, it's
+	 * safe to check this before we take the HPTE lock */
+	vpte = be64_to_cpu(hptep[0]);
+	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+		return 0; /* nothing to do */
+
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+
+	vpte = be64_to_cpu(hptep[0]);
+
+	ret = 0;
+	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+		/* Nothing to do */
+		goto out;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		rpte = be64_to_cpu(hptep[1]);
+		vpte = hpte_new_to_old_v(vpte, rpte);
+	}
+
+	/* Unmap */
+	rev = &old->rev[idx];
+	guest_rpte = rev->guest_rpte;
+
+	ret = -EIO;
+	apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
+	if (!apsize)
+		goto out;
+
+	if (vpte & HPTE_V_VALID) {
+		unsigned long gfn = hpte_rpn(guest_rpte, apsize);
+		int srcu_idx = srcu_read_lock(&kvm->srcu);
+		struct kvm_memory_slot *memslot =
+			__gfn_to_memslot(kvm_memslots(kvm), gfn);
+
+		if (memslot) {
+			unsigned long *rmapp;
+			rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+
+			lock_rmap(rmapp);
+			kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
+			unlock_rmap(rmapp);
+		}
+
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+	}
+
+	/* Reload PTE after unmap */
+	vpte = be64_to_cpu(hptep[0]);
+	BUG_ON(vpte & HPTE_V_VALID);
+	BUG_ON(!(vpte & HPTE_V_ABSENT));
+
+	ret = 0;
+	if (!(vpte & HPTE_V_BOLTED))
+		goto out;
+
+	rpte = be64_to_cpu(hptep[1]);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		vpte = hpte_new_to_old_v(vpte, rpte);
+		rpte = hpte_new_to_old_r(rpte);
+	}
+
+	pshift = kvmppc_hpte_base_page_shift(vpte, rpte);
+	avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);
+	pteg = idx / HPTES_PER_GROUP;
+	if (vpte & HPTE_V_SECONDARY)
+		pteg = ~pteg;
+
+	if (!(vpte & HPTE_V_1TB_SEG)) {
+		unsigned long offset, vsid;
+
+		/* We only have 28 - 23 bits of offset in avpn */
+		offset = (avpn & 0x1f) << 23;
+		vsid = avpn >> 5;
+		/* We can find more bits from the pteg value */
+		if (pshift < 23)
+			offset |= ((vsid ^ pteg) & old_hash_mask) << pshift;
+
+		hash = vsid ^ (offset >> pshift);
+	} else {
+		unsigned long offset, vsid;
+
+		/* We only have 40 - 23 bits of seg_off in avpn */
+		offset = (avpn & 0x1ffff) << 23;
+		vsid = avpn >> 17;
+		if (pshift < 23)
+			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift;
+
+		hash = vsid ^ (vsid << 25) ^ (offset >> pshift);
+	}
+
+	new_pteg = hash & new_hash_mask;
+	if (vpte & HPTE_V_SECONDARY)
+		new_pteg = ~hash & new_hash_mask;
+
+	new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
+	new_hptep = (__be64 *)(new->virt + (new_idx << 4));
+
+	replace_vpte = be64_to_cpu(new_hptep[0]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		unsigned long replace_rpte = be64_to_cpu(new_hptep[1]);
+		replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte);
+	}
+
+	if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+		BUG_ON(new->order >= old->order);
+
+		if (replace_vpte & HPTE_V_BOLTED) {
+			if (vpte & HPTE_V_BOLTED)
+				/* Bolted collision, nothing we can do */
+				ret = -ENOSPC;
+			/* Discard the new HPTE */
+			goto out;
+		}
+
+		/* Discard the previous HPTE */
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		rpte = hpte_old_to_new_r(vpte, rpte);
+		vpte = hpte_old_to_new_v(vpte);
+	}
+
+	new_hptep[1] = cpu_to_be64(rpte);
+	new->rev[new_idx].guest_rpte = guest_rpte;
+	/* No need for a barrier, since new HPT isn't active */
+	new_hptep[0] = cpu_to_be64(vpte);
+	unlock_hpte(new_hptep, vpte);
+
+out:
+	unlock_hpte(hptep, vpte);
+	return ret;
+}
+
+static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
+{
+	struct kvm *kvm = resize->kvm;
+	unsigned  long i;
+	int rc;
+
+	for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
+		rc = resize_hpt_rehash_hpte(resize, i);
+		if (rc != 0)
+			return rc;
+	}
+
+	return 0;
+}
+
+static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
+{
+	struct kvm *kvm = resize->kvm;
+	struct kvm_hpt_info hpt_tmp;
+
+	/* Exchange the pending tables in the resize structure with
+	 * the active tables */
+
+	resize_hpt_debug(resize, "resize_hpt_pivot()\n");
+
+	spin_lock(&kvm->mmu_lock);
+	asm volatile("ptesync" : : : "memory");
+
+	hpt_tmp = kvm->arch.hpt;
+	kvmppc_set_hpt(kvm, &resize->hpt);
+	resize->hpt = hpt_tmp;
+
+	spin_unlock(&kvm->mmu_lock);
+
+	synchronize_srcu_expedited(&kvm->srcu);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvmppc_setup_partition_table(kvm);
+
+	resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
+}
+
+static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
+{
+	if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
+		return;
+
+	if (!resize)
+		return;
+
+	if (resize->error != -EBUSY) {
+		if (resize->hpt.virt)
+			kvmppc_free_hpt(&resize->hpt);
+		kfree(resize);
+	}
+
+	if (kvm->arch.resize_hpt == resize)
+		kvm->arch.resize_hpt = NULL;
+}
+
+static void resize_hpt_prepare_work(struct work_struct *work)
+{
+	struct kvm_resize_hpt *resize = container_of(work,
+						     struct kvm_resize_hpt,
+						     work);
+	struct kvm *kvm = resize->kvm;
+	int err = 0;
+
+	if (WARN_ON(resize->error != -EBUSY))
+		return;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+
+	/* Request is still current? */
+	if (kvm->arch.resize_hpt == resize) {
+		/* We may request large allocations here:
+		 * do not sleep with kvm->arch.mmu_setup_lock held for a while.
+		 */
+		mutex_unlock(&kvm->arch.mmu_setup_lock);
+
+		resize_hpt_debug(resize, "%s(): order = %d\n", __func__,
+				 resize->order);
+
+		err = resize_hpt_allocate(resize);
+
+		/* We have strict assumption about -EBUSY
+		 * when preparing for HPT resize.
+		 */
+		if (WARN_ON(err == -EBUSY))
+			err = -EINPROGRESS;
+
+		mutex_lock(&kvm->arch.mmu_setup_lock);
+		/* It is possible that kvm->arch.resize_hpt != resize
+		 * after we grab kvm->arch.mmu_setup_lock again.
+		 */
+	}
+
+	resize->error = err;
+
+	if (kvm->arch.resize_hpt != resize)
+		resize_hpt_release(kvm, resize);
+
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+}
+
+int kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+				    struct kvm_ppc_resize_hpt *rhpt)
+{
+	unsigned long flags = rhpt->flags;
+	unsigned long shift = rhpt->shift;
+	struct kvm_resize_hpt *resize;
+	int ret;
+
+	if (flags != 0 || kvm_is_radix(kvm))
+		return -EINVAL;
+
+	if (shift && ((shift < 18) || (shift > 46)))
+		return -EINVAL;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+
+	resize = kvm->arch.resize_hpt;
+
+	if (resize) {
+		if (resize->order == shift) {
+			/* Suitable resize in progress? */
+			ret = resize->error;
+			if (ret == -EBUSY)
+				ret = 100; /* estimated time in ms */
+			else if (ret)
+				resize_hpt_release(kvm, resize);
+
+			goto out;
+		}
+
+		/* not suitable, cancel it */
+		resize_hpt_release(kvm, resize);
+	}
+
+	ret = 0;
+	if (!shift)
+		goto out; /* nothing to do */
+
+	/* start new resize */
+
+	resize = kzalloc(sizeof(*resize), GFP_KERNEL);
+	if (!resize) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	resize->error = -EBUSY;
+	resize->order = shift;
+	resize->kvm = kvm;
+	INIT_WORK(&resize->work, resize_hpt_prepare_work);
+	kvm->arch.resize_hpt = resize;
+
+	schedule_work(&resize->work);
+
+	ret = 100; /* estimated time in ms */
+
+out:
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return ret;
+}
+
+static void resize_hpt_boot_vcpu(void *opaque)
+{
+	/* Nothing to do, just force a KVM exit */
+}
+
+int kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+				   struct kvm_ppc_resize_hpt *rhpt)
+{
+	unsigned long flags = rhpt->flags;
+	unsigned long shift = rhpt->shift;
+	struct kvm_resize_hpt *resize;
+	int ret;
+
+	if (flags != 0 || kvm_is_radix(kvm))
+		return -EINVAL;
+
+	if (shift && ((shift < 18) || (shift > 46)))
+		return -EINVAL;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+
+	resize = kvm->arch.resize_hpt;
+
+	/* This shouldn't be possible */
+	ret = -EIO;
+	if (WARN_ON(!kvm->arch.mmu_ready))
+		goto out_no_hpt;
+
+	/* Stop VCPUs from running while we mess with the HPT */
+	kvm->arch.mmu_ready = 0;
+	smp_mb();
+
+	/* Boot all CPUs out of the guest so they re-read
+	 * mmu_ready */
+	on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
+
+	ret = -ENXIO;
+	if (!resize || (resize->order != shift))
+		goto out;
+
+	ret = resize->error;
+	if (ret)
+		goto out;
+
+	ret = resize_hpt_rehash(resize);
+	if (ret)
+		goto out;
+
+	resize_hpt_pivot(resize);
+
+out:
+	/* Let VCPUs run again */
+	kvm->arch.mmu_ready = 1;
+	smp_mb();
+out_no_hpt:
+	resize_hpt_release(kvm, resize);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return ret;
+}
+
+/*
+ * Functions for reading and writing the hash table via reads and
+ * writes on a file descriptor.
+ *
+ * Reads return the guest view of the hash table, which has to be
+ * pieced together from the real hash table and the guest_rpte
+ * values in the revmap array.
+ *
+ * On writes, each HPTE written is considered in turn, and if it
+ * is valid, it is written to the HPT as if an H_ENTER with the
+ * exact flag set was done.  When the invalid count is non-zero
+ * in the header written to the stream, the kernel will make
+ * sure that that many HPTEs are invalid, and invalidate them
+ * if not.
+ */
+
+struct kvm_htab_ctx {
+	unsigned long	index;
+	unsigned long	flags;
+	struct kvm	*kvm;
+	int		first_pass;
+};
+
+#define HPTE_SIZE	(2 * sizeof(unsigned long))
+
+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
+{
+	unsigned long rcbits_unset;
+
+	if (revp->guest_rpte & HPTE_GR_MODIFIED)
+		return 1;
+
+	/* Also need to consider changes in reference and changed bits */
+	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
+	    (be64_to_cpu(hptp[1]) & rcbits_unset))
+		return 1;
+
+	return 0;
+}
+
+static long record_hpte(unsigned long flags, __be64 *hptp,
+			unsigned long *hpte, struct revmap_entry *revp,
+			int want_valid, int first_pass)
+{
+	unsigned long v, r, hr;
+	unsigned long rcbits_unset;
+	int ok = 1;
+	int valid, dirty;
+
+	/* Unmodified entries are uninteresting except on the first pass */
+	dirty = hpte_dirty(revp, hptp);
+	if (!first_pass && !dirty)
+		return 0;
+
+	valid = 0;
+	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+		valid = 1;
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
+		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
+			valid = 0;
+	}
+	if (valid != want_valid)
+		return 0;
+
+	v = r = 0;
+	if (valid || dirty) {
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]);
+		hr = be64_to_cpu(hptp[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, hr);
+			hr = hpte_new_to_old_r(hr);
+		}
+
+		/* re-evaluate valid and dirty from synchronized HPTE value */
+		valid = !!(v & HPTE_V_VALID);
+		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+		/* Harvest R and C into guest view if necessary */
+		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+		if (valid && (rcbits_unset & hr)) {
+			revp->guest_rpte |= (hr &
+				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
+			dirty = 1;
+		}
+
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+			valid = 1;
+		}
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
+			valid = 0;
+
+		r = revp->guest_rpte;
+		/* only clear modified if this is the right sort of entry */
+		if (valid == want_valid && dirty) {
+			r &= ~HPTE_GR_MODIFIED;
+			revp->guest_rpte = r;
+		}
+		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
+		preempt_enable();
+		if (!(valid == want_valid && (first_pass || dirty)))
+			ok = 0;
+	}
+	hpte[0] = cpu_to_be64(v);
+	hpte[1] = cpu_to_be64(r);
+	return ok;
+}
+
+static ssize_t kvm_htab_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	__be64 *hptp;
+	struct revmap_entry *revp;
+	unsigned long i, nb, nw;
+	unsigned long __user *lbuf;
+	struct kvm_get_htab_header __user *hptr;
+	unsigned long flags;
+	int first_pass;
+	unsigned long hpte[2];
+
+	if (!access_ok(buf, count))
+		return -EFAULT;
+	if (kvm_is_radix(kvm))
+		return 0;
+
+	first_pass = ctx->first_pass;
+	flags = ctx->flags;
+
+	i = ctx->index;
+	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+	revp = kvm->arch.hpt.rev + i;
+	lbuf = (unsigned long __user *)buf;
+
+	nb = 0;
+	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
+		/* Initialize header */
+		hptr = (struct kvm_get_htab_header __user *)buf;
+		hdr.n_valid = 0;
+		hdr.n_invalid = 0;
+		nw = nb;
+		nb += sizeof(hdr);
+		lbuf = (unsigned long __user *)(buf + sizeof(hdr));
+
+		/* Skip uninteresting entries, i.e. clean on not-first pass */
+		if (!first_pass) {
+			while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
+			       !hpte_dirty(revp, hptp)) {
+				++i;
+				hptp += 2;
+				++revp;
+			}
+		}
+		hdr.index = i;
+
+		/* Grab a series of valid entries */
+		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
+		       hdr.n_valid < 0xffff &&
+		       nb + HPTE_SIZE < count &&
+		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
+			/* valid entry, write it out */
+			++hdr.n_valid;
+			if (__put_user(hpte[0], lbuf) ||
+			    __put_user(hpte[1], lbuf + 1))
+				return -EFAULT;
+			nb += HPTE_SIZE;
+			lbuf += 2;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+		/* Now skip invalid entries while we can */
+		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
+		       hdr.n_invalid < 0xffff &&
+		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
+			/* found an invalid entry */
+			++hdr.n_invalid;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+
+		if (hdr.n_valid || hdr.n_invalid) {
+			/* write back the header */
+			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
+				return -EFAULT;
+			nw = nb;
+			buf = (char __user *)lbuf;
+		} else {
+			nb = nw;
+		}
+
+		/* Check if we've wrapped around the hash table */
+		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
+			i = 0;
+			ctx->first_pass = 0;
+			break;
+		}
+	}
+
+	ctx->index = i;
+
+	return nb;
+}
+
+static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	unsigned long i, j;
+	unsigned long v, r;
+	unsigned long __user *lbuf;
+	__be64 *hptp;
+	unsigned long tmp[2];
+	ssize_t nb;
+	long int err, ret;
+	int mmu_ready;
+	int pshift;
+
+	if (!access_ok(buf, count))
+		return -EFAULT;
+	if (kvm_is_radix(kvm))
+		return -EINVAL;
+
+	/* lock out vcpus from running while we're doing this */
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+	mmu_ready = kvm->arch.mmu_ready;
+	if (mmu_ready) {
+		kvm->arch.mmu_ready = 0;	/* temporarily */
+		/* order mmu_ready vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.mmu_ready = 1;
+			mutex_unlock(&kvm->arch.mmu_setup_lock);
+			return -EBUSY;
+		}
+	}
+
+	err = 0;
+	for (nb = 0; nb + sizeof(hdr) <= count; ) {
+		err = -EFAULT;
+		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
+			break;
+
+		err = 0;
+		if (nb + hdr.n_valid * HPTE_SIZE > count)
+			break;
+
+		nb += sizeof(hdr);
+		buf += sizeof(hdr);
+
+		err = -EINVAL;
+		i = hdr.index;
+		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
+		    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
+			break;
+
+		hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+		lbuf = (unsigned long __user *)buf;
+		for (j = 0; j < hdr.n_valid; ++j) {
+			__be64 hpte_v;
+			__be64 hpte_r;
+
+			err = -EFAULT;
+			if (__get_user(hpte_v, lbuf) ||
+			    __get_user(hpte_r, lbuf + 1))
+				goto out;
+			v = be64_to_cpu(hpte_v);
+			r = be64_to_cpu(hpte_r);
+			err = -EINVAL;
+			if (!(v & HPTE_V_VALID))
+				goto out;
+			pshift = kvmppc_hpte_base_page_shift(v, r);
+			if (pshift <= 0)
+				goto out;
+			lbuf += 2;
+			nb += HPTE_SIZE;
+
+			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			err = -EIO;
+			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
+							 tmp);
+			if (ret != H_SUCCESS) {
+				pr_err("%s ret %ld i=%ld v=%lx r=%lx\n", __func__, ret, i, v, r);
+				goto out;
+			}
+			if (!mmu_ready && is_vrma_hpte(v)) {
+				unsigned long senc, lpcr;
+
+				senc = slb_pgsize_encoding(1ul << pshift);
+				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+					(VRMA_VSID << SLB_VSID_SHIFT_1T);
+				if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+					lpcr = senc << (LPCR_VRMASD_SH - 4);
+					kvmppc_update_lpcr(kvm, lpcr,
+							   LPCR_VRMASD);
+				} else {
+					kvmppc_setup_partition_table(kvm);
+				}
+				mmu_ready = 1;
+			}
+			++i;
+			hptp += 2;
+		}
+
+		for (j = 0; j < hdr.n_invalid; ++j) {
+			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			++i;
+			hptp += 2;
+		}
+		err = 0;
+	}
+
+ out:
+	/* Order HPTE updates vs. mmu_ready */
+	smp_wmb();
+	kvm->arch.mmu_ready = mmu_ready;
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+
+	if (err)
+		return err;
+	return nb;
+}
+
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_htab_ctx *ctx = filp->private_data;
+
+	filp->private_data = NULL;
+	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+	kvm_put_kvm(ctx->kvm);
+	kfree(ctx);
+	return 0;
+}
+
+static const struct file_operations kvm_htab_fops = {
+	.read		= kvm_htab_read,
+	.write		= kvm_htab_write,
+	.llseek		= default_llseek,
+	.release	= kvm_htab_release,
+};
+
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+	int ret;
+	struct kvm_htab_ctx *ctx;
+	int rwflag;
+
+	/* reject flags we don't recognize */
+	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	kvm_get_kvm(kvm);
+	ctx->kvm = kvm;
+	ctx->index = ghf->start_index;
+	ctx->flags = ghf->flags;
+	ctx->first_pass = 1;
+
+	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
+	if (ret < 0) {
+		kfree(ctx);
+		kvm_put_kvm_no_destroy(kvm);
+		return ret;
+	}
+
+	if (rwflag == O_RDONLY) {
+		mutex_lock(&kvm->slots_lock);
+		atomic_inc(&kvm->arch.hpte_mod_interest);
+		/* make sure kvmppc_do_h_enter etc. see the increment */
+		synchronize_srcu_expedited(&kvm->srcu);
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
+struct debugfs_htab_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	hpt_index;
+	int		chars_left;
+	int		buf_index;
+	char		buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_htab_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_htab_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_htab_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long i, n;
+	unsigned long v, hr, gr;
+	struct kvm *kvm;
+	__be64 *hptp;
+
+	kvm = p->kvm;
+	if (kvm_is_radix(kvm))
+		return 0;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	i = p->hpt_index;
+	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
+	     ++i, hptp += 2) {
+		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+		hr = be64_to_cpu(hptp[1]);
+		gr = kvm->arch.hpt.rev[i].guest_rpte;
+		unlock_hpte(hptp, v);
+		preempt_enable();
+
+		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      "%6lx %.16lx %.16lx %.16lx\n",
+			      i, v, hr, gr);
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+	p->hpt_index = i;
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_htab_open,
+	.release = debugfs_htab_release,
+	.read	 = debugfs_htab_read,
+	.write	 = debugfs_htab_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+	debugfs_create_file("htab", 0400, kvm->debugfs_dentry, kvm,
+			    &debugfs_htab_fops);
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
new file mode 100644
index 0000000000..10aacbf924
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -0,0 +1,1492 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
+#include <linux/pgtable.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include "book3s_hv.h"
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/ultravisor.h>
+#include <asm/kvm_book3s_uvmem.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/firmware.h>
+
+/*
+ * Supported radix tree geometry.
+ * Like p9, we support either 5 or 9 bits at the first (lowest) level,
+ * for a page size of 64k or 4k.
+ */
+static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
+
+unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+					      gva_t eaddr, void *to, void *from,
+					      unsigned long n)
+{
+	int old_pid, old_lpid;
+	unsigned long quadrant, ret = n;
+	bool is_load = !!to;
+
+	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
+	if (kvmhv_on_pseries())
+		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
+					  (to != NULL) ? __pa(to): 0,
+					  (from != NULL) ? __pa(from): 0, n);
+
+	if (eaddr & (0xFFFUL << 52))
+		return ret;
+
+	quadrant = 1;
+	if (!pid)
+		quadrant = 2;
+	if (is_load)
+		from = (void *) (eaddr | (quadrant << 62));
+	else
+		to = (void *) (eaddr | (quadrant << 62));
+
+	preempt_disable();
+
+	asm volatile("hwsync" ::: "memory");
+	isync();
+	/* switch the lpid first to avoid running host with unallocated pid */
+	old_lpid = mfspr(SPRN_LPID);
+	if (old_lpid != lpid)
+		mtspr(SPRN_LPID, lpid);
+	if (quadrant == 1) {
+		old_pid = mfspr(SPRN_PID);
+		if (old_pid != pid)
+			mtspr(SPRN_PID, pid);
+	}
+	isync();
+
+	pagefault_disable();
+	if (is_load)
+		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
+	else
+		ret = __copy_to_user_inatomic((void __user *)to, from, n);
+	pagefault_enable();
+
+	asm volatile("hwsync" ::: "memory");
+	isync();
+	/* switch the pid first to avoid running host with unallocated pid */
+	if (quadrant == 1 && pid != old_pid)
+		mtspr(SPRN_PID, old_pid);
+	if (lpid != old_lpid)
+		mtspr(SPRN_LPID, old_lpid);
+	isync();
+
+	preempt_enable();
+
+	return ret;
+}
+
+static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+					  void *to, void *from, unsigned long n)
+{
+	int lpid = vcpu->kvm->arch.lpid;
+	int pid = vcpu->arch.pid;
+
+	/* This would cause a data segment intr so don't allow the access */
+	if (eaddr & (0x3FFUL << 52))
+		return -EINVAL;
+
+	/* Should we be using the nested lpid */
+	if (vcpu->arch.nested)
+		lpid = vcpu->arch.nested->shadow_lpid;
+
+	/* If accessing quadrant 3 then pid is expected to be 0 */
+	if (((eaddr >> 62) & 0x3) == 0x3)
+		pid = 0;
+
+	eaddr &= ~(0xFFFUL << 52);
+
+	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
+}
+
+long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
+				 unsigned long n)
+{
+	long ret;
+
+	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
+	if (ret > 0)
+		memset(to + (n - ret), 0, ret);
+
+	return ret;
+}
+
+long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
+			       unsigned long n)
+{
+	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
+}
+
+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+			       struct kvmppc_pte *gpte, u64 root,
+			       u64 *pte_ret_p)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret, level, ps;
+	unsigned long rts, bits, offset, index;
+	u64 pte, base, gpa;
+	__be64 rpte;
+
+	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
+		((root & RTS2_MASK) >> RTS2_SHIFT);
+	bits = root & RPDS_MASK;
+	base = root & RPDB_MASK;
+
+	offset = rts + 31;
+
+	/* Current implementations only support 52-bit space */
+	if (offset != 52)
+		return -EINVAL;
+
+	/* Walk each level of the radix tree */
+	for (level = 3; level >= 0; --level) {
+		u64 addr;
+		/* Check a valid size */
+		if (level && bits != p9_supported_radix_bits[level])
+			return -EINVAL;
+		if (level == 0 && !(bits == 5 || bits == 9))
+			return -EINVAL;
+		offset -= bits;
+		index = (eaddr >> offset) & ((1UL << bits) - 1);
+		/* Check that low bits of page table base are zero */
+		if (base & ((1UL << (bits + 3)) - 1))
+			return -EINVAL;
+		/* Read the entry from guest memory */
+		addr = base + (index * sizeof(rpte));
+
+		kvm_vcpu_srcu_read_lock(vcpu);
+		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+		kvm_vcpu_srcu_read_unlock(vcpu);
+		if (ret) {
+			if (pte_ret_p)
+				*pte_ret_p = addr;
+			return ret;
+		}
+		pte = __be64_to_cpu(rpte);
+		if (!(pte & _PAGE_PRESENT))
+			return -ENOENT;
+		/* Check if a leaf entry */
+		if (pte & _PAGE_PTE)
+			break;
+		/* Get ready to walk the next level */
+		base = pte & RPDB_MASK;
+		bits = pte & RPDS_MASK;
+	}
+
+	/* Need a leaf at lowest level; 512GB pages not supported */
+	if (level < 0 || level == 3)
+		return -EINVAL;
+
+	/* We found a valid leaf PTE */
+	/* Offset is now log base 2 of the page size */
+	gpa = pte & 0x01fffffffffff000ul;
+	if (gpa & ((1ul << offset) - 1))
+		return -EINVAL;
+	gpa |= eaddr & ((1ul << offset) - 1);
+	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
+		if (offset == mmu_psize_defs[ps].shift)
+			break;
+	gpte->page_size = ps;
+	gpte->page_shift = offset;
+
+	gpte->eaddr = eaddr;
+	gpte->raddr = gpa;
+
+	/* Work out permissions */
+	gpte->may_read = !!(pte & _PAGE_READ);
+	gpte->may_write = !!(pte & _PAGE_WRITE);
+	gpte->may_execute = !!(pte & _PAGE_EXEC);
+
+	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
+	if (pte_ret_p)
+		*pte_ret_p = pte;
+
+	return 0;
+}
+
+/*
+ * Used to walk a partition or process table radix tree in guest memory
+ * Note: We exploit the fact that a partition table and a process
+ * table have the same layout, a partition-scoped page table and a
+ * process-scoped page table have the same layout, and the 2nd
+ * doubleword of a partition table entry has the same layout as
+ * the PTCR register.
+ */
+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+				     struct kvmppc_pte *gpte, u64 table,
+				     int table_index, u64 *pte_ret_p)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret;
+	unsigned long size, ptbl, root;
+	struct prtb_entry entry;
+
+	if ((table & PRTS_MASK) > 24)
+		return -EINVAL;
+	size = 1ul << ((table & PRTS_MASK) + 12);
+
+	/* Is the table big enough to contain this entry? */
+	if ((table_index * sizeof(entry)) >= size)
+		return -EINVAL;
+
+	/* Read the table to find the root of the radix tree */
+	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+	kvm_vcpu_srcu_read_lock(vcpu);
+	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+	kvm_vcpu_srcu_read_unlock(vcpu);
+	if (ret)
+		return ret;
+
+	/* Root is stored in the first double word */
+	root = be64_to_cpu(entry.prtb0);
+
+	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+}
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			   struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	u32 pid;
+	u64 pte;
+	int ret;
+
+	/* Work out effective PID */
+	switch (eaddr >> 62) {
+	case 0:
+		pid = vcpu->arch.pid;
+		break;
+	case 3:
+		pid = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+				vcpu->kvm->arch.process_table, pid, &pte);
+	if (ret)
+		return ret;
+
+	/* Check privilege (applies only to process scoped translations) */
+	if (kvmppc_get_msr(vcpu) & MSR_PR) {
+		if (pte & _PAGE_PRIVILEGED) {
+			gpte->may_read = 0;
+			gpte->may_write = 0;
+			gpte->may_execute = 0;
+		}
+	} else {
+		if (!(pte & _PAGE_PRIVILEGED)) {
+			/* Check AMR/IAMR to see if strict mode is in force */
+			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
+				gpte->may_read = 0;
+			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
+				gpte->may_write = 0;
+			if (vcpu->arch.iamr & (1ul << 62))
+				gpte->may_execute = 0;
+		}
+	}
+
+	return 0;
+}
+
+void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
+			     unsigned int pshift, unsigned int lpid)
+{
+	unsigned long psize = PAGE_SIZE;
+	int psi;
+	long rc;
+	unsigned long rb;
+
+	if (pshift)
+		psize = 1UL << pshift;
+	else
+		pshift = PAGE_SHIFT;
+
+	addr &= ~(psize - 1);
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_tlb_lpid_page(lpid, addr, psize);
+		return;
+	}
+
+	psi = shift_to_mmu_psize(pshift);
+
+	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
+		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+					lpid, rb);
+	} else {
+		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+					    H_RPTI_TYPE_NESTED |
+					    H_RPTI_TYPE_TLB,
+					    psize_to_rpti_pgsize(psi),
+					    addr, addr + psize);
+	}
+
+	if (rc)
+		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
+}
+
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
+{
+	long rc;
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_pwc_lpid(lpid);
+		return;
+	}
+
+	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+					lpid, TLBIEL_INVAL_SET_LPID);
+	else
+		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+					    H_RPTI_TYPE_NESTED |
+					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
+					    0, -1UL);
+	if (rc)
+		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
+}
+
+static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
+				      unsigned long clr, unsigned long set,
+				      unsigned long addr, unsigned int shift)
+{
+	return __radix_pte_update(ptep, clr, set);
+}
+
+static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+			     pte_t *ptep, pte_t pte)
+{
+	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
+}
+
+static struct kmem_cache *kvm_pte_cache;
+static struct kmem_cache *kvm_pmd_cache;
+
+static pte_t *kvmppc_pte_alloc(void)
+{
+	pte_t *pte;
+
+	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+	/* pmd_populate() will only reference _pa(pte). */
+	kmemleak_ignore(pte);
+
+	return pte;
+}
+
+static void kvmppc_pte_free(pte_t *ptep)
+{
+	kmem_cache_free(kvm_pte_cache, ptep);
+}
+
+static pmd_t *kvmppc_pmd_alloc(void)
+{
+	pmd_t *pmd;
+
+	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
+	/* pud_populate() will only reference _pa(pmd). */
+	kmemleak_ignore(pmd);
+
+	return pmd;
+}
+
+static void kvmppc_pmd_free(pmd_t *pmdp)
+{
+	kmem_cache_free(kvm_pmd_cache, pmdp);
+}
+
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+		      unsigned int shift,
+		      const struct kvm_memory_slot *memslot,
+		      unsigned int lpid)
+
+{
+	unsigned long old;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	unsigned long page_size = PAGE_SIZE;
+	unsigned long hpa;
+
+	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
+	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+
+	/* The following only applies to L1 entries */
+	if (lpid != kvm->arch.lpid)
+		return;
+
+	if (!memslot) {
+		memslot = gfn_to_memslot(kvm, gfn);
+		if (!memslot)
+			return;
+	}
+	if (shift) { /* 1GB or 2MB page */
+		page_size = 1ul << shift;
+		if (shift == PMD_SHIFT)
+			kvm->stat.num_2M_pages--;
+		else if (shift == PUD_SHIFT)
+			kvm->stat.num_1G_pages--;
+	}
+
+	gpa &= ~(page_size - 1);
+	hpa = old & PTE_RPN_MASK;
+	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, page_size);
+}
+
+/*
+ * kvmppc_free_p?d are used to free existing page tables, and recursively
+ * descend and clear and free children.
+ * Callers are responsible for flushing the PWC.
+ *
+ * When page tables are being unmapped/freed as part of page fault path
+ * (full == false), valid ptes are generally not expected; however, there
+ * is one situation where they arise, which is when dirty page logging is
+ * turned off for a memslot while the VM is running.  The new memslot
+ * becomes visible to page faults before the memslot commit function
+ * gets to flush the memslot, which can lead to a 2MB page mapping being
+ * installed for a guest physical address where there are already 64kB
+ * (or 4kB) mappings (of sub-pages of the same 2MB page).
+ */
+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+				  unsigned int lpid)
+{
+	if (full) {
+		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
+	} else {
+		pte_t *p = pte;
+		unsigned long it;
+
+		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
+			if (pte_val(*p) == 0)
+				continue;
+			kvmppc_unmap_pte(kvm, p,
+					 pte_pfn(*p) << PAGE_SHIFT,
+					 PAGE_SHIFT, NULL, lpid);
+		}
+	}
+
+	kvmppc_pte_free(pte);
+}
+
+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+				  unsigned int lpid)
+{
+	unsigned long im;
+	pmd_t *p = pmd;
+
+	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
+		if (!pmd_present(*p))
+			continue;
+		if (pmd_is_leaf(*p)) {
+			if (full) {
+				pmd_clear(p);
+			} else {
+				WARN_ON_ONCE(1);
+				kvmppc_unmap_pte(kvm, (pte_t *)p,
+					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
+					 PMD_SHIFT, NULL, lpid);
+			}
+		} else {
+			pte_t *pte;
+
+			pte = pte_offset_kernel(p, 0);
+			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
+			pmd_clear(p);
+		}
+	}
+	kvmppc_pmd_free(pmd);
+}
+
+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+				  unsigned int lpid)
+{
+	unsigned long iu;
+	pud_t *p = pud;
+
+	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
+		if (!pud_present(*p))
+			continue;
+		if (pud_is_leaf(*p)) {
+			pud_clear(p);
+		} else {
+			pmd_t *pmd;
+
+			pmd = pmd_offset(p, 0);
+			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
+			pud_clear(p);
+		}
+	}
+	pud_free(kvm->mm, pud);
+}
+
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
+{
+	unsigned long ig;
+
+	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+		p4d_t *p4d = p4d_offset(pgd, 0);
+		pud_t *pud;
+
+		if (!p4d_present(*p4d))
+			continue;
+		pud = pud_offset(p4d, 0);
+		kvmppc_unmap_free_pud(kvm, pud, lpid);
+		p4d_clear(p4d);
+	}
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+	if (kvm->arch.pgtable) {
+		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+					  kvm->arch.lpid);
+		pgd_free(kvm->mm, kvm->arch.pgtable);
+		kvm->arch.pgtable = NULL;
+	}
+}
+
+static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
+					unsigned long gpa, unsigned int lpid)
+{
+	pte_t *pte = pte_offset_kernel(pmd, 0);
+
+	/*
+	 * Clearing the pmd entry then flushing the PWC ensures that the pte
+	 * page no longer be cached by the MMU, so can be freed without
+	 * flushing the PWC again.
+	 */
+	pmd_clear(pmd);
+	kvmppc_radix_flush_pwc(kvm, lpid);
+
+	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
+}
+
+static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
+					unsigned long gpa, unsigned int lpid)
+{
+	pmd_t *pmd = pmd_offset(pud, 0);
+
+	/*
+	 * Clearing the pud entry then flushing the PWC ensures that the pmd
+	 * page and any children pte pages will no longer be cached by the MMU,
+	 * so can be freed without flushing the PWC again.
+	 */
+	pud_clear(pud);
+	kvmppc_radix_flush_pwc(kvm, lpid);
+
+	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
+}
+
+/*
+ * There are a number of bits which may differ between different faults to
+ * the same partition scope entry. RC bits, in the course of cleaning and
+ * aging. And the write bit can change, either the access could have been
+ * upgraded, or a read fault could happen concurrently with a write fault
+ * that sets those bits first.
+ */
+#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
+
+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+		      unsigned long gpa, unsigned int level,
+		      unsigned long mmu_seq, unsigned int lpid,
+		      unsigned long *rmapp, struct rmap_nested **n_rmap)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud, *new_pud = NULL;
+	pmd_t *pmd, *new_pmd = NULL;
+	pte_t *ptep, *new_ptep = NULL;
+	int ret;
+
+	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
+	pgd = pgtable + pgd_index(gpa);
+	p4d = p4d_offset(pgd, gpa);
+
+	pud = NULL;
+	if (p4d_present(*p4d))
+		pud = pud_offset(p4d, gpa);
+	else
+		new_pud = pud_alloc_one(kvm->mm, gpa);
+
+	pmd = NULL;
+	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
+		pmd = pmd_offset(pud, gpa);
+	else if (level <= 1)
+		new_pmd = kvmppc_pmd_alloc();
+
+	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
+		new_ptep = kvmppc_pte_alloc();
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	spin_lock(&kvm->mmu_lock);
+	ret = -EAGAIN;
+	if (mmu_invalidate_retry(kvm, mmu_seq))
+		goto out_unlock;
+
+	/* Now traverse again under the lock and change the tree */
+	ret = -ENOMEM;
+	if (p4d_none(*p4d)) {
+		if (!new_pud)
+			goto out_unlock;
+		p4d_populate(kvm->mm, p4d, new_pud);
+		new_pud = NULL;
+	}
+	pud = pud_offset(p4d, gpa);
+	if (pud_is_leaf(*pud)) {
+		unsigned long hgpa = gpa & PUD_MASK;
+
+		/* Check if we raced and someone else has set the same thing */
+		if (level == 2) {
+			if (pud_raw(*pud) == pte_raw(pte)) {
+				ret = 0;
+				goto out_unlock;
+			}
+			/* Valid 1GB page here already, add our extra bits */
+			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
+							PTE_BITS_MUST_MATCH);
+			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
+					      0, pte_val(pte), hgpa, PUD_SHIFT);
+			ret = 0;
+			goto out_unlock;
+		}
+		/*
+		 * If we raced with another CPU which has just put
+		 * a 1GB pte in after we saw a pmd page, try again.
+		 */
+		if (!new_pmd) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+		/* Valid 1GB page here already, remove it */
+		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+				 lpid);
+	}
+	if (level == 2) {
+		if (!pud_none(*pud)) {
+			/*
+			 * There's a page table page here, but we wanted to
+			 * install a large page, so remove and free the page
+			 * table page.
+			 */
+			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
+		}
+		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+		if (rmapp && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
+		ret = 0;
+		goto out_unlock;
+	}
+	if (pud_none(*pud)) {
+		if (!new_pmd)
+			goto out_unlock;
+		pud_populate(kvm->mm, pud, new_pmd);
+		new_pmd = NULL;
+	}
+	pmd = pmd_offset(pud, gpa);
+	if (pmd_is_leaf(*pmd)) {
+		unsigned long lgpa = gpa & PMD_MASK;
+
+		/* Check if we raced and someone else has set the same thing */
+		if (level == 1) {
+			if (pmd_raw(*pmd) == pte_raw(pte)) {
+				ret = 0;
+				goto out_unlock;
+			}
+			/* Valid 2MB page here already, add our extra bits */
+			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
+							PTE_BITS_MUST_MATCH);
+			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
+					0, pte_val(pte), lgpa, PMD_SHIFT);
+			ret = 0;
+			goto out_unlock;
+		}
+
+		/*
+		 * If we raced with another CPU which has just put
+		 * a 2MB pte in after we saw a pte page, try again.
+		 */
+		if (!new_ptep) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+		/* Valid 2MB page here already, remove it */
+		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+				 lpid);
+	}
+	if (level == 1) {
+		if (!pmd_none(*pmd)) {
+			/*
+			 * There's a page table page here, but we wanted to
+			 * install a large page, so remove and free the page
+			 * table page.
+			 */
+			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
+		}
+		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+		if (rmapp && n_rmap)
+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
+		ret = 0;
+		goto out_unlock;
+	}
+	if (pmd_none(*pmd)) {
+		if (!new_ptep)
+			goto out_unlock;
+		pmd_populate(kvm->mm, pmd, new_ptep);
+		new_ptep = NULL;
+	}
+	ptep = pte_offset_kernel(pmd, gpa);
+	if (pte_present(*ptep)) {
+		/* Check if someone else set the same thing */
+		if (pte_raw(*ptep) == pte_raw(pte)) {
+			ret = 0;
+			goto out_unlock;
+		}
+		/* Valid page here already, add our extra bits */
+		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
+							PTE_BITS_MUST_MATCH);
+		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
+		ret = 0;
+		goto out_unlock;
+	}
+	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+	if (rmapp && n_rmap)
+		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
+	ret = 0;
+
+ out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	if (new_pud)
+		pud_free(kvm->mm, new_pud);
+	if (new_pmd)
+		kvmppc_pmd_free(new_pmd);
+	if (new_ptep)
+		kvmppc_pte_free(new_ptep);
+	return ret;
+}
+
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
+			     unsigned long gpa, unsigned int lpid)
+{
+	unsigned long pgflags;
+	unsigned int shift;
+	pte_t *ptep;
+
+	/*
+	 * Need to set an R or C bit in the 2nd-level tables;
+	 * since we are just helping out the hardware here,
+	 * it is sufficient to do what the hardware does.
+	 */
+	pgflags = _PAGE_ACCESSED;
+	if (writing)
+		pgflags |= _PAGE_DIRTY;
+
+	if (nested)
+		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+	else
+		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+
+	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+		return true;
+	}
+	return false;
+}
+
+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+				   unsigned long gpa,
+				   struct kvm_memory_slot *memslot,
+				   bool writing, bool kvm_ro,
+				   pte_t *inserted_pte, unsigned int *levelp)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct page *page = NULL;
+	unsigned long mmu_seq;
+	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
+	bool upgrade_write = false;
+	bool *upgrade_p = &upgrade_write;
+	pte_t pte, *ptep;
+	unsigned int shift, level;
+	int ret;
+	bool large_enable;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	/*
+	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+	 * do it with !atomic && !async, which is how we call it.
+	 * We always ask for write permission since the common case
+	 * is that the page is writable.
+	 */
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
+		upgrade_write = true;
+	} else {
+		unsigned long pfn;
+
+		/* Call KVM generic code to do the slow-path check */
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
+					   writing, upgrade_p, NULL);
+		if (is_error_noslot_pfn(pfn))
+			return -EFAULT;
+		page = NULL;
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (PageReserved(page))
+				page = NULL;
+		}
+	}
+
+	/*
+	 * Read the PTE from the process' radix tree and use that
+	 * so we get the shift and attribute bits.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+	pte = __pte(0);
+	if (ptep)
+		pte = READ_ONCE(*ptep);
+	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * If the PTE disappeared temporarily due to a THP
+	 * collapse, just return and let the guest try again.
+	 */
+	if (!pte_present(pte)) {
+		if (page)
+			put_page(page);
+		return RESUME_GUEST;
+	}
+
+	/* If we're logging dirty pages, always map single pages */
+	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
+
+	/* Get pte level from shift/size */
+	if (large_enable && shift == PUD_SHIFT &&
+	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+	    (hva & (PUD_SIZE - PAGE_SIZE))) {
+		level = 2;
+	} else if (large_enable && shift == PMD_SHIFT &&
+		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+		   (hva & (PMD_SIZE - PAGE_SIZE))) {
+		level = 1;
+	} else {
+		level = 0;
+		if (shift > PAGE_SHIFT) {
+			/*
+			 * If the pte maps more than one page, bring over
+			 * bits from the virtual address to get the real
+			 * address of the specific single page we want.
+			 */
+			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+			pte = __pte(pte_val(pte) | (hva & rpnmask));
+		}
+	}
+
+	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
+	if (writing || upgrade_write) {
+		if (pte_val(pte) & _PAGE_WRITE)
+			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
+	} else {
+		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
+	}
+
+	/* Allocate space in the tree and write the PTE */
+	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+				mmu_seq, kvm->arch.lpid, NULL, NULL);
+	if (inserted_pte)
+		*inserted_pte = pte;
+	if (levelp)
+		*levelp = level;
+
+	if (page) {
+		if (!ret && (pte_val(pte) & _PAGE_WRITE))
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+
+	/* Increment number of large pages if we (successfully) inserted one */
+	if (!ret) {
+		if (level == 1)
+			kvm->stat.num_2M_pages++;
+		else if (level == 2)
+			kvm->stat.num_1G_pages++;
+	}
+
+	return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
+				   unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long gpa, gfn;
+	struct kvm_memory_slot *memslot;
+	long ret;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	bool kvm_ro = false;
+
+	/* Check for unusual errors */
+	if (dsisr & DSISR_UNSUPP_MMU) {
+		pr_err("KVM: Got unsupported MMU fault\n");
+		return -EFAULT;
+	}
+	if (dsisr & DSISR_BADACCESS) {
+		/* Reflect to the guest as DSI */
+		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+		kvmppc_core_queue_data_storage(vcpu,
+				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+				ea, dsisr);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address */
+	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+	gpa &= ~0xF000000000000000ul;
+	gfn = gpa >> PAGE_SHIFT;
+	if (!(dsisr & DSISR_PRTABLE_FAULT))
+		gpa |= ea & 0xfff;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return kvmppc_send_page_to_uv(kvm, gfn);
+
+	/* Get the corresponding memslot */
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
+			     DSISR_SET_RC)) {
+			/*
+			 * Bad address in guest page table tree, or other
+			 * unusual error - reflect it to the guest as DSI.
+			 */
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, dsisr);
+			return RESUME_GUEST;
+		}
+		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
+	}
+
+	if (memslot->flags & KVM_MEM_READONLY) {
+		if (writing) {
+			/* give the guest a DSI */
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
+			return RESUME_GUEST;
+		}
+		kvm_ro = true;
+	}
+
+	/* Failed to set the reference/change bits */
+	if (dsisr & DSISR_SET_RC) {
+		spin_lock(&kvm->mmu_lock);
+		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
+					    gpa, kvm->arch.lpid))
+			dsisr &= ~DSISR_SET_RC;
+		spin_unlock(&kvm->mmu_lock);
+
+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+			       DSISR_PROTFAULT | DSISR_SET_RC)))
+			return RESUME_GUEST;
+	}
+
+	/* Try to insert a pte */
+	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+					     kvm_ro, NULL, NULL);
+
+	if (ret == 0 || ret == -EAGAIN)
+		ret = RESUME_GUEST;
+	return ret;
+}
+
+/* Called with kvm->mmu_lock held */
+void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		     unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
+		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
+		return;
+	}
+
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+	if (ptep && pte_present(*ptep))
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+				 kvm->arch.lpid);
+}
+
+/* Called with kvm->mmu_lock held */
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		   unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	bool ref = false;
+	unsigned long old, *rmapp;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return ref;
+
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
+		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
+					      gpa, shift);
+		/* XXX need to flush tlb here? */
+		/* Also clear bit in ptes in shadow pgtable for nested guests */
+		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
+					       old & PTE_RPN_MASK,
+					       1UL << shift);
+		ref = true;
+	}
+	return ref;
+}
+
+/* Called with kvm->mmu_lock held */
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn)
+
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	bool ref = false;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return ref;
+
+	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep))
+		ref = true;
+	return ref;
+}
+
+/* Returns the number of PAGE_SIZE pages that are dirty */
+static int kvm_radix_test_clear_dirty(struct kvm *kvm,
+				struct kvm_memory_slot *memslot, int pagenum)
+{
+	unsigned long gfn = memslot->base_gfn + pagenum;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	pte_t *ptep, pte;
+	unsigned int shift;
+	int ret = 0;
+	unsigned long old, *rmapp;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return ret;
+
+	/*
+	 * For performance reasons we don't hold kvm->mmu_lock while walking the
+	 * partition scoped table.
+	 */
+	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
+	if (!ptep)
+		return 0;
+
+	pte = READ_ONCE(*ptep);
+	if (pte_present(pte) && pte_dirty(pte)) {
+		spin_lock(&kvm->mmu_lock);
+		/*
+		 * Recheck the pte again
+		 */
+		if (pte_val(pte) != pte_val(*ptep)) {
+			/*
+			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
+			 * only find PAGE_SIZE pte entries here. We can continue
+			 * to use the pte addr returned by above page table
+			 * walk.
+			 */
+			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
+				spin_unlock(&kvm->mmu_lock);
+				return 0;
+			}
+		}
+
+		ret = 1;
+		VM_BUG_ON(shift);
+		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
+					      gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
+		/* Also clear bit in ptes in shadow pgtable for nested guests */
+		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
+					       old & PTE_RPN_MASK,
+					       1UL << shift);
+		spin_unlock(&kvm->mmu_lock);
+	}
+	return ret;
+}
+
+long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
+{
+	unsigned long i, j;
+	int npages;
+
+	for (i = 0; i < memslot->npages; i = j) {
+		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
+
+		/*
+		 * Note that if npages > 0 then i must be a multiple of npages,
+		 * since huge pages are only used to back the guest at guest
+		 * real addresses that are a multiple of their size.
+		 * Since we have at most one PTE covering any given guest
+		 * real address, if npages > 1 we can skip to i + npages.
+		 */
+		j = i + 1;
+		if (npages) {
+			set_dirty_bits(map, i, npages);
+			j = i + npages;
+		}
+	}
+	return 0;
+}
+
+void kvmppc_radix_flush_memslot(struct kvm *kvm,
+				const struct kvm_memory_slot *memslot)
+{
+	unsigned long n;
+	pte_t *ptep;
+	unsigned long gpa;
+	unsigned int shift;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
+		kvmppc_uvmem_drop_pages(memslot, kvm, true);
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return;
+
+	gpa = memslot->base_gfn << PAGE_SHIFT;
+	spin_lock(&kvm->mmu_lock);
+	for (n = memslot->npages; n; --n) {
+		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+		if (ptep && pte_present(*ptep))
+			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+					 kvm->arch.lpid);
+		gpa += PAGE_SIZE;
+	}
+	/*
+	 * Increase the mmu notifier sequence number to prevent any page
+	 * fault that read the memslot earlier from writing a PTE.
+	 */
+	kvm->mmu_invalidate_seq++;
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
+				 int psize, int *indexp)
+{
+	if (!mmu_psize_defs[psize].shift)
+		return;
+	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
+		(mmu_psize_defs[psize].ap << 29);
+	++(*indexp);
+}
+
+int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
+{
+	int i;
+
+	if (!radix_enabled())
+		return -EINVAL;
+	memset(info, 0, sizeof(*info));
+
+	/* 4k page size */
+	info->geometries[0].page_shift = 12;
+	info->geometries[0].level_bits[0] = 9;
+	for (i = 1; i < 4; ++i)
+		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
+	/* 64k page size */
+	info->geometries[1].page_shift = 16;
+	for (i = 0; i < 4; ++i)
+		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
+
+	i = 0;
+	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
+
+	return 0;
+}
+
+int kvmppc_init_vm_radix(struct kvm *kvm)
+{
+	kvm->arch.pgtable = pgd_alloc(kvm->mm);
+	if (!kvm->arch.pgtable)
+		return -ENOMEM;
+	return 0;
+}
+
+static void pte_ctor(void *addr)
+{
+	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
+}
+
+struct debugfs_radix_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	gpa;
+	int		lpid;
+	int		chars_left;
+	int		buf_index;
+	char		buf[128];
+	u8		hdr;
+};
+
+static int debugfs_radix_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_radix_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_radix_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_radix_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_radix_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long n;
+	struct kvm *kvm;
+	unsigned long gpa;
+	pgd_t *pgt;
+	struct kvm_nested_guest *nested;
+	pgd_t *pgdp;
+	p4d_t p4d, *p4dp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t *ptep;
+	int shift;
+	unsigned long pte;
+
+	kvm = p->kvm;
+	if (!kvm_is_radix(kvm))
+		return 0;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	gpa = p->gpa;
+	nested = NULL;
+	pgt = NULL;
+	while (len != 0 && p->lpid >= 0) {
+		if (gpa >= RADIX_PGTABLE_RANGE) {
+			gpa = 0;
+			pgt = NULL;
+			if (nested) {
+				kvmhv_put_nested(nested);
+				nested = NULL;
+			}
+			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+			p->hdr = 0;
+			if (p->lpid < 0)
+				break;
+		}
+		if (!pgt) {
+			if (p->lpid == 0) {
+				pgt = kvm->arch.pgtable;
+			} else {
+				nested = kvmhv_get_nested(kvm, p->lpid, false);
+				if (!nested) {
+					gpa = RADIX_PGTABLE_RANGE;
+					continue;
+				}
+				pgt = nested->shadow_pgtable;
+			}
+		}
+		n = 0;
+		if (!p->hdr) {
+			if (p->lpid > 0)
+				n = scnprintf(p->buf, sizeof(p->buf),
+					      "\nNested LPID %d: ", p->lpid);
+			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+				      "pgdir: %lx\n", (unsigned long)pgt);
+			p->hdr = 1;
+			goto copy;
+		}
+
+		pgdp = pgt + pgd_index(gpa);
+		p4dp = p4d_offset(pgdp, gpa);
+		p4d = READ_ONCE(*p4dp);
+		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
+			gpa = (gpa & P4D_MASK) + P4D_SIZE;
+			continue;
+		}
+
+		pudp = pud_offset(&p4d, gpa);
+		pud = READ_ONCE(*pudp);
+		if (!(pud_val(pud) & _PAGE_PRESENT)) {
+			gpa = (gpa & PUD_MASK) + PUD_SIZE;
+			continue;
+		}
+		if (pud_val(pud) & _PAGE_PTE) {
+			pte = pud_val(pud);
+			shift = PUD_SHIFT;
+			goto leaf;
+		}
+
+		pmdp = pmd_offset(&pud, gpa);
+		pmd = READ_ONCE(*pmdp);
+		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+			gpa = (gpa & PMD_MASK) + PMD_SIZE;
+			continue;
+		}
+		if (pmd_val(pmd) & _PAGE_PTE) {
+			pte = pmd_val(pmd);
+			shift = PMD_SHIFT;
+			goto leaf;
+		}
+
+		ptep = pte_offset_kernel(&pmd, gpa);
+		pte = pte_val(READ_ONCE(*ptep));
+		if (!(pte & _PAGE_PRESENT)) {
+			gpa += PAGE_SIZE;
+			continue;
+		}
+		shift = PAGE_SHIFT;
+	leaf:
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      " %lx: %lx %d\n", gpa, pte, shift);
+		gpa += 1ul << shift;
+	copy:
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			break;
+		}
+	}
+	p->gpa = gpa;
+	if (nested)
+		kvmhv_put_nested(nested);
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_radix_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_radix_open,
+	.release = debugfs_radix_release,
+	.read	 = debugfs_radix_read,
+	.write	 = debugfs_radix_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmhv_radix_debugfs_init(struct kvm *kvm)
+{
+	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
+			    &debugfs_radix_fops);
+}
+
+int kvmppc_radix_init(void)
+{
+	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
+
+	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
+	if (!kvm_pte_cache)
+		return -ENOMEM;
+
+	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
+
+	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
+	if (!kvm_pmd_cache) {
+		kmem_cache_destroy(kvm_pte_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void kvmppc_radix_exit(void)
+{
+	kmem_cache_destroy(kvm_pte_cache);
+	kmem_cache_destroy(kvm_pmd_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
new file mode 100644
index 0000000000..4d958dd21e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#define SHADOW_SLB_ENTRY_LEN	0x10
+#define OFFSET_ESID(x)		(SHADOW_SLB_ENTRY_LEN * x)
+#define OFFSET_VSID(x)		((SHADOW_SLB_ENTRY_LEN * x) + 8)
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R3 = shadow vcpu
+	 * all other volatile GPRS = free except R4, R6
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 */
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as 0 entries big */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+	li	r8, 0
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Flush SLB */
+
+	li	r10, 0
+	slbmte	r10, r10
+	slbia
+
+	/* Fill SLB with our shadow */
+
+	lbz	r12, SVCPU_SLB_MAX(r3)
+	mulli	r12, r12, 16
+	addi	r12, r12, SVCPU_SLB
+	add	r12, r12, r3
+
+	/* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_size; r11+=slb_entry) */
+	li	r11, SVCPU_SLB
+	add	r11, r11, r3
+
+slb_loop_enter:
+
+	ld	r10, 0(r11)
+
+	andis.	r9, r10, SLB_ESID_V@h
+	beq	slb_loop_enter_skip
+
+	ld	r9, 8(r11)
+	slbmte	r9, r10
+
+slb_loop_enter_skip:
+	addi	r11, r11, 16
+	cmpd	cr0, r11, r12
+	blt	slb_loop_enter
+
+slb_do_enter:
+
+.endm
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+	/* Register usage at this point:
+	 *
+	 * R1         = host R1
+	 * R2         = host R2
+	 * R12        = exit handler id
+	 * R13        = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+	 * SVCPU.*    = guest *
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 *
+	 */
+
+	/* Remove all SLB entries that are in use. */
+
+	li	r0, 0
+	slbmte	r0, r0
+	slbia
+
+	/* Restore bolted entries from the shadow */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as SLB_NUM_BOLTED entries big */
+
+	li	r8, SLB_NUM_BOLTED
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Manually load all entries from shadow SLB */
+
+	li	r8, SLBSHADOW_SAVEAREA
+	li	r7, SLBSHADOW_SAVEAREA + 8
+
+	.rept	SLB_NUM_BOLTED
+	LDX_BE	r10, r11, r8
+	cmpdi	r10, 0
+	beq	1f
+	LDX_BE	r9, r11, r7
+	slbmte	r9, r10
+1:	addi	r7, r7, SHADOW_SLB_ENTRY_LEN
+	addi	r8, r8, SHADOW_SLB_ENTRY_LEN
+	.endr
+
+	isync
+	sync
+
+slb_do_exit:
+
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
new file mode 100644
index 0000000000..93b695b289
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -0,0 +1,798 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/iommu.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/mmu_context.h>
+
+static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
+	unsigned long liobn)
+{
+	struct kvmppc_spapr_tce_table *stt;
+
+	list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+		if (stt->liobn == liobn)
+			return stt;
+
+	return NULL;
+}
+
+static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+{
+	return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
+{
+	unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+			(tce_pages * sizeof(struct page *));
+
+	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
+			struct kvmppc_spapr_tce_iommu_table, rcu);
+
+	iommu_tce_table_put(stit->tbl);
+
+	kfree(stit);
+}
+
+static void kvm_spapr_tce_liobn_put(struct kref *kref)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref,
+			struct kvmppc_spapr_tce_iommu_table, kref);
+
+	list_del_rcu(&stit->next);
+
+	call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free);
+}
+
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+		struct iommu_group *grp)
+{
+	int i;
+	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct iommu_table_group *table_group = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+
+		table_group = iommu_group_get_iommudata(grp);
+		if (WARN_ON(!table_group))
+			continue;
+
+		list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+			for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+				if (table_group->tables[i] != stit->tbl)
+					continue;
+
+				kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
+			}
+		}
+		cond_resched_rcu();
+	}
+	rcu_read_unlock();
+}
+
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+		struct iommu_group *grp)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	bool found = false;
+	struct iommu_table *tbl = NULL;
+	struct iommu_table_group *table_group;
+	long i;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	struct fd f;
+
+	f = fdget(tablefd);
+	if (!f.file)
+		return -EBADF;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt == f.file->private_data) {
+			found = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	fdput(f);
+
+	if (!found)
+		return -EINVAL;
+
+	table_group = iommu_group_get_iommudata(grp);
+	if (WARN_ON(!table_group))
+		return -EFAULT;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbltmp = table_group->tables[i];
+
+		if (!tbltmp)
+			continue;
+		/* Make sure hardware table parameters are compatible */
+		if ((tbltmp->it_page_shift <= stt->page_shift) &&
+				(tbltmp->it_offset << tbltmp->it_page_shift ==
+				 stt->offset << stt->page_shift) &&
+				(tbltmp->it_size << tbltmp->it_page_shift >=
+				 stt->size << stt->page_shift)) {
+			/*
+			 * Reference the table to avoid races with
+			 * add/remove DMA windows.
+			 */
+			tbl = iommu_tce_table_get(tbltmp);
+			break;
+		}
+	}
+	if (!tbl)
+		return -EINVAL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		if (tbl != stit->tbl)
+			continue;
+
+		if (!kref_get_unless_zero(&stit->kref)) {
+			/* stit is being destroyed */
+			iommu_tce_table_put(tbl);
+			rcu_read_unlock();
+			return -ENOTTY;
+		}
+		/*
+		 * The table is already known to this KVM, we just increased
+		 * its KVM reference counter and can return.
+		 */
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	stit = kzalloc(sizeof(*stit), GFP_KERNEL);
+	if (!stit) {
+		iommu_tce_table_put(tbl);
+		return -ENOMEM;
+	}
+
+	stit->tbl = tbl;
+	kref_init(&stit->kref);
+
+	list_add_rcu(&stit->next, &stt->iommu_tables);
+
+	return 0;
+}
+
+static void release_spapr_tce_table(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_table *stt = container_of(head,
+			struct kvmppc_spapr_tce_table, rcu);
+	unsigned long i, npages = kvmppc_tce_pages(stt->size);
+
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
+
+	kfree(stt);
+}
+
+static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
+		unsigned long sttpage)
+{
+	struct page *page = stt->pages[sttpage];
+
+	if (page)
+		return page;
+
+	mutex_lock(&stt->alloc_lock);
+	page = stt->pages[sttpage];
+	if (!page) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		WARN_ON_ONCE(!page);
+		if (page)
+			stt->pages[sttpage] = page;
+	}
+	mutex_unlock(&stt->alloc_lock);
+
+	return page;
+}
+
+static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
+		return VM_FAULT_SIGBUS;
+
+	page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
+	if (!page)
+		return VM_FAULT_OOM;
+
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct kvm *kvm = stt->kvm;
+
+	mutex_lock(&kvm->lock);
+	list_del_rcu(&stt->list);
+	mutex_unlock(&kvm->lock);
+
+	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+		WARN_ON(!kref_read(&stit->kref));
+		while (1) {
+			if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
+				break;
+		}
+	}
+
+	account_locked_vm(kvm->mm,
+		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+
+	kvm_put_kvm(stt->kvm);
+
+	call_rcu(&stt->rcu, release_spapr_tce_table);
+
+	return 0;
+}
+
+static const struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+int kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				  struct kvm_create_spapr_tce_64 *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	struct kvmppc_spapr_tce_table *siter;
+	struct mm_struct *mm = kvm->mm;
+	unsigned long npages;
+	int ret;
+
+	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
+		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
+		return -EINVAL;
+
+	npages = kvmppc_tce_pages(args->size);
+	ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
+	stt = kzalloc(struct_size(stt, pages, npages), GFP_KERNEL | __GFP_NOWARN);
+	if (!stt)
+		goto fail_acct;
+
+	stt->liobn = args->liobn;
+	stt->page_shift = args->page_shift;
+	stt->offset = args->offset;
+	stt->size = args->size;
+	stt->kvm = kvm;
+	mutex_init(&stt->alloc_lock);
+	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
+
+	mutex_lock(&kvm->lock);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	kvm_get_kvm(kvm);
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR | O_CLOEXEC);
+
+	if (ret >= 0)
+		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+	else
+		kvm_put_kvm_no_destroy(kvm);
+
+	mutex_unlock(&kvm->lock);
+
+	if (ret >= 0)
+		return ret;
+
+	kfree(stt);
+ fail_acct:
+	account_locked_vm(mm, kvmppc_stt_pages(npages), false);
+	return ret;
+}
+
+static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
+		unsigned long *ua)
+{
+	unsigned long gfn = tce >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+	if (!memslot)
+		return -EINVAL;
+
+	*ua = __gfn_to_hva_memslot(memslot, gfn) |
+		(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+	return 0;
+}
+
+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long tce)
+{
+	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	enum dma_data_direction dir = iommu_tce_direction(tce);
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long ua = 0;
+
+	/* Allow userspace to poison TCE table */
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	if (iommu_tce_check_gpa(stt->page_shift, gpa))
+		return H_TOO_HARD;
+
+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
+		return H_TOO_HARD;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		unsigned long hpa = 0;
+		struct mm_iommu_table_group_mem_t *mem;
+		long shift = stit->tbl->it_page_shift;
+
+		mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+		if (!mem || mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) {
+			rcu_read_unlock();
+			return H_TOO_HARD;
+		}
+	}
+	rcu_read_unlock();
+
+	return H_SUCCESS;
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ */
+static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+		unsigned long idx, unsigned long tce)
+{
+	struct page *page;
+	u64 *tbl;
+	unsigned long sttpage;
+
+	idx -= stt->offset;
+	sttpage = idx / TCES_PER_PAGE;
+	page = stt->pages[sttpage];
+
+	if (!page) {
+		/* We allow any TCE, not just with read|write permissions */
+		if (!tce)
+			return;
+
+		page = kvm_spapr_get_tce_page(stt, sttpage);
+		if (!page)
+			return;
+	}
+	tbl = page_to_virt(page);
+
+	tbl[idx % TCES_PER_PAGE] = tce;
+}
+
+static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long i;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift);
+
+	for (i = 0; i < subpages; ++i) {
+		unsigned long hpa = 0;
+		enum dma_data_direction dir = DMA_NONE;
+
+		iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir);
+	}
+}
+
+static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
+
+	if (!pua)
+		return H_SUCCESS;
+
+	mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
+	if (!mem)
+		return H_TOO_HARD;
+
+	mm_iommu_mapped_dec(mem);
+
+	*pua = cpu_to_be64(0);
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	enum dma_data_direction dir = DMA_NONE;
+	unsigned long hpa = 0;
+	long ret;
+
+	if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
+					&dir)))
+		return H_TOO_HARD;
+
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+	if (ret != H_SUCCESS)
+		iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
+
+	return ret;
+}
+
+static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
+		struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
+		unsigned long entry)
+{
+	unsigned long i, ret = H_SUCCESS;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry * subpages;
+
+	for (i = 0; i < subpages; ++i) {
+		ret = kvmppc_tce_iommu_do_unmap(kvm, tbl, io_entry + i);
+		if (ret != H_SUCCESS)
+			break;
+	}
+
+	iommu_tce_kill(tbl, io_entry, subpages);
+
+	return ret;
+}
+
+static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	long ret;
+	unsigned long hpa;
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+	if (!mem)
+		/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
+		return H_TOO_HARD;
+
+	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
+		return H_TOO_HARD;
+
+	if (mm_iommu_mapped_inc(mem))
+		return H_TOO_HARD;
+
+	ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
+	if (WARN_ON_ONCE(ret)) {
+		mm_iommu_mapped_dec(mem);
+		return H_TOO_HARD;
+	}
+
+	if (dir != DMA_NONE)
+		kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+	*pua = cpu_to_be64(ua);
+
+	return 0;
+}
+
+static long kvmppc_tce_iommu_map(struct kvm *kvm,
+		struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	unsigned long i, pgoff, ret = H_SUCCESS;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry * subpages;
+
+	for (i = 0, pgoff = 0; i < subpages;
+			++i, pgoff += IOMMU_PAGE_SIZE(tbl)) {
+
+		ret = kvmppc_tce_iommu_do_map(kvm, tbl,
+				io_entry + i, ua + pgoff, dir);
+		if (ret != H_SUCCESS)
+			break;
+	}
+
+	iommu_tce_kill(tbl, io_entry, subpages);
+
+	return ret;
+}
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long ret, idx;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long entry, ua = 0;
+	enum dma_data_direction dir;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvmppc_tce_validate(stt, tce);
+	if (ret != H_SUCCESS)
+		goto unlock_exit;
+
+	dir = iommu_tce_direction(tce);
+
+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
+		ret = H_PARAMETER;
+		goto unlock_exit;
+	}
+
+	entry = ioba >> stt->page_shift;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		if (dir == DMA_NONE)
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
+					stit->tbl, entry);
+		else
+			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
+					entry, ua, dir);
+
+
+		if (ret != H_SUCCESS) {
+			kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry);
+			goto unlock_exit;
+		}
+	}
+
+	kvmppc_tce_put(stt, entry, tce);
+
+unlock_exit:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret = H_SUCCESS, idx;
+	unsigned long entry, ua = 0;
+	u64 __user *tces;
+	u64 tce;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	entry = ioba >> stt->page_shift;
+	/*
+	 * SPAPR spec says that the maximum size of the list is 512 TCEs
+	 * so the whole table fits in 4K page
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & (SZ_4K - 1))
+		return H_PARAMETER;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
+		ret = H_TOO_HARD;
+		goto unlock_exit;
+	}
+	tces = (u64 __user *) ua;
+
+	for (i = 0; i < npages; ++i) {
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
+
+		ret = kvmppc_tce_validate(stt, tce);
+		if (ret != H_SUCCESS)
+			goto unlock_exit;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		/*
+		 * This looks unsafe, because we validate, then regrab
+		 * the TCE from userspace which could have been changed by
+		 * another thread.
+		 *
+		 * But it actually is safe, because the relevant checks will be
+		 * re-executed in the following code.  If userspace tries to
+		 * change this dodgily it will result in a messier failure mode
+		 * but won't threaten the host.
+		 */
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
+
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
+			ret = H_PARAMETER;
+			goto unlock_exit;
+		}
+
+		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt,
+					stit->tbl, entry + i, ua,
+					iommu_tce_direction(tce));
+
+			if (ret != H_SUCCESS) {
+				kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl,
+						 entry + i);
+				goto unlock_exit;
+			}
+		}
+
+		kvmppc_tce_put(stt, entry + i, tce);
+	}
+
+unlock_exit:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	/* Check permission bits only to allow userspace poison TCE for debug */
+	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+		return H_PARAMETER;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long entry = ioba >> stt->page_shift;
+
+		for (i = 0; i < npages; ++i) {
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
+					stit->tbl, entry + i);
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				return ret;
+
+			WARN_ON_ONCE(1);
+			kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i);
+		}
+	}
+
+	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long ret;
+	unsigned long idx;
+	struct page *page;
+	u64 *tbl;
+
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = (ioba >> stt->page_shift) - stt->offset;
+	page = stt->pages[idx / TCES_PER_PAGE];
+	if (!page) {
+		vcpu->arch.regs.gpr[4] = 0;
+		return H_SUCCESS;
+	}
+	tbl = (u64 *)page_address(page);
+
+	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
new file mode 100644
index 0000000000..5bbfb2eed1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/reg.h>
+#include <asm/switch_to.h>
+#include <asm/time.h>
+#include <asm/tm.h>
+#include "book3s.h"
+#include <asm/asm-prototypes.h>
+
+#define OP_19_XOP_RFID		18
+#define OP_19_XOP_RFI		50
+
+#define OP_31_XOP_MFMSR		83
+#define OP_31_XOP_MTMSR		146
+#define OP_31_XOP_MTMSRD	178
+#define OP_31_XOP_MTSR		210
+#define OP_31_XOP_MTSRIN	242
+#define OP_31_XOP_TLBIEL	274
+/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
+#define OP_31_XOP_FAKE_SC1	308
+#define OP_31_XOP_SLBMTE	402
+#define OP_31_XOP_SLBIE		434
+#define OP_31_XOP_SLBIA		498
+#define OP_31_XOP_MFSR		595
+#define OP_31_XOP_MFSRIN	659
+#define OP_31_XOP_DCBA		758
+#define OP_31_XOP_SLBMFEV	851
+#define OP_31_XOP_EIOIO		854
+#define OP_31_XOP_SLBMFEE	915
+#define OP_31_XOP_SLBFEE	979
+
+#define OP_31_XOP_TBEGIN	654
+#define OP_31_XOP_TABORT	910
+
+#define OP_31_XOP_TRECLAIM	942
+#define OP_31_XOP_TRCHKPT	1006
+
+/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
+#define OP_31_XOP_DCBZ		1010
+
+#define OP_LFS			48
+#define OP_LFD			50
+#define OP_STFS			52
+#define OP_STFD			54
+
+#define SPRN_GQR0		912
+#define SPRN_GQR1		913
+#define SPRN_GQR2		914
+#define SPRN_GQR3		915
+#define SPRN_GQR4		916
+#define SPRN_GQR5		917
+#define SPRN_GQR6		918
+#define SPRN_GQR7		919
+
+enum priv_level {
+	PRIV_PROBLEM = 0,
+	PRIV_SUPER = 1,
+	PRIV_HYPER = 2,
+};
+
+static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
+{
+	/* PAPR VMs only access supervisor SPRs */
+	if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
+		return false;
+
+	/* Limit user space to its own small SPR set */
+	if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM)
+		return false;
+
+	return true;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+	memcpy(&vcpu->arch.gpr_tm[0], &vcpu->arch.regs.gpr[0],
+			sizeof(vcpu->arch.gpr_tm));
+	memcpy(&vcpu->arch.fp_tm, &vcpu->arch.fp,
+			sizeof(struct thread_fp_state));
+	memcpy(&vcpu->arch.vr_tm, &vcpu->arch.vr,
+			sizeof(struct thread_vr_state));
+	vcpu->arch.ppr_tm = vcpu->arch.ppr;
+	vcpu->arch.dscr_tm = vcpu->arch.dscr;
+	vcpu->arch.amr_tm = vcpu->arch.amr;
+	vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
+	vcpu->arch.tar_tm = vcpu->arch.tar;
+	vcpu->arch.lr_tm = vcpu->arch.regs.link;
+	vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
+	vcpu->arch.xer_tm = vcpu->arch.regs.xer;
+	vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
+}
+
+static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+	memcpy(&vcpu->arch.regs.gpr[0], &vcpu->arch.gpr_tm[0],
+			sizeof(vcpu->arch.regs.gpr));
+	memcpy(&vcpu->arch.fp, &vcpu->arch.fp_tm,
+			sizeof(struct thread_fp_state));
+	memcpy(&vcpu->arch.vr, &vcpu->arch.vr_tm,
+			sizeof(struct thread_vr_state));
+	vcpu->arch.ppr = vcpu->arch.ppr_tm;
+	vcpu->arch.dscr = vcpu->arch.dscr_tm;
+	vcpu->arch.amr = vcpu->arch.amr_tm;
+	vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
+	vcpu->arch.tar = vcpu->arch.tar_tm;
+	vcpu->arch.regs.link = vcpu->arch.lr_tm;
+	vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
+	vcpu->arch.regs.xer = vcpu->arch.xer_tm;
+	vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
+}
+
+static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
+{
+	unsigned long guest_msr = kvmppc_get_msr(vcpu);
+	int fc_val = ra_val ? ra_val : 1;
+	uint64_t texasr;
+
+	/* CR0 = 0 | MSR[TS] | 0 */
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
+		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+		 << CR0_SHIFT);
+
+	preempt_disable();
+	tm_enable();
+	texasr = mfspr(SPRN_TEXASR);
+	kvmppc_save_tm_pr(vcpu);
+	kvmppc_copyfrom_vcpu_tm(vcpu);
+
+	/* failure recording depends on Failure Summary bit */
+	if (!(texasr & TEXASR_FS)) {
+		texasr &= ~TEXASR_FC;
+		texasr |= ((u64)fc_val << TEXASR_FC_LG) | TEXASR_FS;
+
+		texasr &= ~(TEXASR_PR | TEXASR_HV);
+		if (kvmppc_get_msr(vcpu) & MSR_PR)
+			texasr |= TEXASR_PR;
+
+		if (kvmppc_get_msr(vcpu) & MSR_HV)
+			texasr |= TEXASR_HV;
+
+		vcpu->arch.texasr = texasr;
+		vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+		mtspr(SPRN_TEXASR, texasr);
+		mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+	}
+	tm_disable();
+	/*
+	 * treclaim need quit to non-transactional state.
+	 */
+	guest_msr &= ~(MSR_TS_MASK);
+	kvmppc_set_msr(vcpu, guest_msr);
+	preempt_enable();
+
+	if (vcpu->arch.shadow_fscr & FSCR_TAR)
+		mtspr(SPRN_TAR, vcpu->arch.tar);
+}
+
+static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
+{
+	unsigned long guest_msr = kvmppc_get_msr(vcpu);
+
+	preempt_disable();
+	/*
+	 * need flush FP/VEC/VSX to vcpu save area before
+	 * copy.
+	 */
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+	kvmppc_copyto_vcpu_tm(vcpu);
+	kvmppc_save_tm_sprs(vcpu);
+
+	/*
+	 * as a result of trecheckpoint. set TS to suspended.
+	 */
+	guest_msr &= ~(MSR_TS_MASK);
+	guest_msr |= MSR_TS_S;
+	kvmppc_set_msr(vcpu, guest_msr);
+	kvmppc_restore_tm_pr(vcpu);
+	preempt_enable();
+}
+
+/* emulate tabort. at guest privilege state */
+void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
+{
+	/* currently we only emulate tabort. but no emulation of other
+	 * tabort variants since there is no kernel usage of them at
+	 * present.
+	 */
+	unsigned long guest_msr = kvmppc_get_msr(vcpu);
+	uint64_t org_texasr;
+
+	preempt_disable();
+	tm_enable();
+	org_texasr = mfspr(SPRN_TEXASR);
+	tm_abort(ra_val);
+
+	/* CR0 = 0 | MSR[TS] | 0 */
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
+		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+		 << CR0_SHIFT);
+
+	vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+	/* failure recording depends on Failure Summary bit,
+	 * and tabort will be treated as nops in non-transactional
+	 * state.
+	 */
+	if (!(org_texasr & TEXASR_FS) &&
+			MSR_TM_ACTIVE(guest_msr)) {
+		vcpu->arch.texasr &= ~(TEXASR_PR | TEXASR_HV);
+		if (guest_msr & MSR_PR)
+			vcpu->arch.texasr |= TEXASR_PR;
+
+		if (guest_msr & MSR_HV)
+			vcpu->arch.texasr |= TEXASR_HV;
+
+		vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+	}
+	tm_disable();
+	preempt_enable();
+}
+
+#endif
+
+int kvmppc_core_emulate_op_pr(struct kvm_vcpu *vcpu,
+			      unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int rt = get_rt(inst);
+	int rs = get_rs(inst);
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	u32 inst_sc = 0x44000002;
+
+	switch (get_op(inst)) {
+	case 0:
+		emulated = EMULATE_FAIL;
+		if ((kvmppc_get_msr(vcpu) & MSR_LE) &&
+		    (inst == swab32(inst_sc))) {
+			/*
+			 * This is the byte reversed syscall instruction of our
+			 * hypercall handler. Early versions of LE Linux didn't
+			 * swap the instructions correctly and ended up in
+			 * illegal instructions.
+			 * Just always fail hypercalls on these broken systems.
+			 */
+			kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED);
+			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+			emulated = EMULATE_DONE;
+		}
+		break;
+	case 19:
+		switch (get_xop(inst)) {
+		case OP_19_XOP_RFID:
+		case OP_19_XOP_RFI: {
+			unsigned long srr1 = kvmppc_get_srr1(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+			unsigned long cur_msr = kvmppc_get_msr(vcpu);
+
+			/*
+			 * add rules to fit in ISA specification regarding TM
+			 * state transition in TM disable/Suspended state,
+			 * and target TM state is TM inactive(00) state. (the
+			 * change should be suppressed).
+			 */
+			if (((cur_msr & MSR_TM) == 0) &&
+				((srr1 & MSR_TM) == 0) &&
+				MSR_TM_SUSPENDED(cur_msr) &&
+				!MSR_TM_ACTIVE(srr1))
+				srr1 |= MSR_TS_S;
+#endif
+			kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu));
+			kvmppc_set_msr(vcpu, srr1);
+			*advance = 0;
+			break;
+		}
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+	case 31:
+		switch (get_xop(inst)) {
+		case OP_31_XOP_MFMSR:
+			kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu));
+			break;
+		case OP_31_XOP_MTMSRD:
+		{
+			ulong rs_val = kvmppc_get_gpr(vcpu, rs);
+			if (inst & 0x10000) {
+				ulong new_msr = kvmppc_get_msr(vcpu);
+				new_msr &= ~(MSR_RI | MSR_EE);
+				new_msr |= rs_val & (MSR_RI | MSR_EE);
+				kvmppc_set_msr_fast(vcpu, new_msr);
+			} else
+				kvmppc_set_msr(vcpu, rs_val);
+			break;
+		}
+		case OP_31_XOP_MTMSR:
+			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_MFSR:
+		{
+			int srnum;
+
+			srnum = kvmppc_get_field(inst, 12 + 32, 15 + 32);
+			if (vcpu->arch.mmu.mfsrin) {
+				u32 sr;
+				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+				kvmppc_set_gpr(vcpu, rt, sr);
+			}
+			break;
+		}
+		case OP_31_XOP_MFSRIN:
+		{
+			int srnum;
+
+			srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf;
+			if (vcpu->arch.mmu.mfsrin) {
+				u32 sr;
+				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+				kvmppc_set_gpr(vcpu, rt, sr);
+			}
+			break;
+		}
+		case OP_31_XOP_MTSR:
+			vcpu->arch.mmu.mtsrin(vcpu,
+				(inst >> 16) & 0xf,
+				kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_MTSRIN:
+			vcpu->arch.mmu.mtsrin(vcpu,
+				(kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf,
+				kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_TLBIE:
+		case OP_31_XOP_TLBIEL:
+		{
+			bool large = (inst & 0x00200000) ? true : false;
+			ulong addr = kvmppc_get_gpr(vcpu, rb);
+			vcpu->arch.mmu.tlbie(vcpu, addr, large);
+			break;
+		}
+#ifdef CONFIG_PPC_BOOK3S_64
+		case OP_31_XOP_FAKE_SC1:
+		{
+			/* SC 1 papr hypercalls */
+			ulong cmd = kvmppc_get_gpr(vcpu, 3);
+			int i;
+
+		        if ((kvmppc_get_msr(vcpu) & MSR_PR) ||
+			    !vcpu->arch.papr_enabled) {
+				emulated = EMULATE_FAIL;
+				break;
+			}
+
+			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
+				break;
+
+			vcpu->run->papr_hcall.nr = cmd;
+			for (i = 0; i < 9; ++i) {
+				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+				vcpu->run->papr_hcall.args[i] = gpr;
+			}
+
+			vcpu->run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			emulated = EMULATE_EXIT_USER;
+			break;
+		}
+#endif
+		case OP_31_XOP_EIOIO:
+			break;
+		case OP_31_XOP_SLBMTE:
+			if (!vcpu->arch.mmu.slbmte)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbmte(vcpu,
+					kvmppc_get_gpr(vcpu, rs),
+					kvmppc_get_gpr(vcpu, rb));
+			break;
+		case OP_31_XOP_SLBIE:
+			if (!vcpu->arch.mmu.slbie)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbie(vcpu,
+					kvmppc_get_gpr(vcpu, rb));
+			break;
+		case OP_31_XOP_SLBIA:
+			if (!vcpu->arch.mmu.slbia)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbia(vcpu);
+			break;
+		case OP_31_XOP_SLBFEE:
+			if (!(inst & 1) || !vcpu->arch.mmu.slbfee) {
+				return EMULATE_FAIL;
+			} else {
+				ulong b, t;
+				ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK;
+
+				b = kvmppc_get_gpr(vcpu, rb);
+				if (!vcpu->arch.mmu.slbfee(vcpu, b, &t))
+					cr |= 2 << CR0_SHIFT;
+				kvmppc_set_gpr(vcpu, rt, t);
+				/* copy XER[SO] bit to CR0[SO] */
+				cr |= (vcpu->arch.regs.xer & 0x80000000) >>
+					(31 - CR0_SHIFT);
+				kvmppc_set_cr(vcpu, cr);
+			}
+			break;
+		case OP_31_XOP_SLBMFEE:
+			if (!vcpu->arch.mmu.slbmfee) {
+				emulated = EMULATE_FAIL;
+			} else {
+				ulong t, rb_val;
+
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfee(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
+			}
+			break;
+		case OP_31_XOP_SLBMFEV:
+			if (!vcpu->arch.mmu.slbmfev) {
+				emulated = EMULATE_FAIL;
+			} else {
+				ulong t, rb_val;
+
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfev(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
+			}
+			break;
+		case OP_31_XOP_DCBA:
+			/* Gets treated as NOP */
+			break;
+		case OP_31_XOP_DCBZ:
+		{
+			ulong rb_val = kvmppc_get_gpr(vcpu, rb);
+			ulong ra_val = 0;
+			ulong addr, vaddr;
+			u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+			u32 dsisr;
+			int r;
+
+			if (ra)
+				ra_val = kvmppc_get_gpr(vcpu, ra);
+
+			addr = (ra_val + rb_val) & ~31ULL;
+			if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+				addr &= 0xffffffff;
+			vaddr = addr;
+
+			r = kvmppc_st(vcpu, &addr, 32, zeros, true);
+			if ((r == -ENOENT) || (r == -EPERM)) {
+				*advance = 0;
+				kvmppc_set_dar(vcpu, vaddr);
+				vcpu->arch.fault_dar = vaddr;
+
+				dsisr = DSISR_ISSTORE;
+				if (r == -ENOENT)
+					dsisr |= DSISR_NOHPTE;
+				else if (r == -EPERM)
+					dsisr |= DSISR_PROTFAULT;
+
+				kvmppc_set_dsisr(vcpu, dsisr);
+				vcpu->arch.fault_dsisr = dsisr;
+
+				kvmppc_book3s_queue_irqprio(vcpu,
+					BOOK3S_INTERRUPT_DATA_STORAGE);
+			}
+
+			break;
+		}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+		case OP_31_XOP_TBEGIN:
+		{
+			if (!cpu_has_feature(CPU_FTR_TM))
+				break;
+
+			if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+				kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
+				preempt_disable();
+				vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
+				  (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
+
+				vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
+					(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
+						 << TEXASR_FC_LG));
+
+				if ((inst >> 21) & 0x1)
+					vcpu->arch.texasr |= TEXASR_ROT;
+
+				if (kvmppc_get_msr(vcpu) & MSR_HV)
+					vcpu->arch.texasr |= TEXASR_HV;
+
+				vcpu->arch.tfhar = kvmppc_get_pc(vcpu) + 4;
+				vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+
+				kvmppc_restore_tm_sprs(vcpu);
+				preempt_enable();
+			} else
+				emulated = EMULATE_FAIL;
+			break;
+		}
+		case OP_31_XOP_TABORT:
+		{
+			ulong guest_msr = kvmppc_get_msr(vcpu);
+			unsigned long ra_val = 0;
+
+			if (!cpu_has_feature(CPU_FTR_TM))
+				break;
+
+			if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+				kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			/* only emulate for privilege guest, since problem state
+			 * guest can run with TM enabled and we don't expect to
+			 * trap at here for that case.
+			 */
+			WARN_ON(guest_msr & MSR_PR);
+
+			if (ra)
+				ra_val = kvmppc_get_gpr(vcpu, ra);
+
+			kvmppc_emulate_tabort(vcpu, ra_val);
+			break;
+		}
+		case OP_31_XOP_TRECLAIM:
+		{
+			ulong guest_msr = kvmppc_get_msr(vcpu);
+			unsigned long ra_val = 0;
+
+			if (!cpu_has_feature(CPU_FTR_TM))
+				break;
+
+			if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+				kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			/* generate interrupts based on priorities */
+			if (guest_msr & MSR_PR) {
+				/* Privileged Instruction type Program Interrupt */
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			if (!MSR_TM_ACTIVE(guest_msr)) {
+				/* TM bad thing interrupt */
+				kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			if (ra)
+				ra_val = kvmppc_get_gpr(vcpu, ra);
+			kvmppc_emulate_treclaim(vcpu, ra_val);
+			break;
+		}
+		case OP_31_XOP_TRCHKPT:
+		{
+			ulong guest_msr = kvmppc_get_msr(vcpu);
+			unsigned long texasr;
+
+			if (!cpu_has_feature(CPU_FTR_TM))
+				break;
+
+			if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+				kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			/* generate interrupt based on priorities */
+			if (guest_msr & MSR_PR) {
+				/* Privileged Instruction type Program Intr */
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			tm_enable();
+			texasr = mfspr(SPRN_TEXASR);
+			tm_disable();
+
+			if (MSR_TM_ACTIVE(guest_msr) ||
+				!(texasr & (TEXASR_FS))) {
+				/* TM bad thing interrupt */
+				kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+				emulated = EMULATE_AGAIN;
+				break;
+			}
+
+			kvmppc_emulate_trchkpt(vcpu);
+			break;
+		}
+#endif
+		default:
+			emulated = EMULATE_FAIL;
+		}
+		break;
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL)
+		emulated = kvmppc_emulate_paired_single(vcpu);
+
+	return emulated;
+}
+
+void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
+                    u32 val)
+{
+	if (upper) {
+		/* Upper BAT */
+		u32 bl = (val >> 2) & 0x7ff;
+		bat->bepi_mask = (~bl << 17);
+		bat->bepi = val & 0xfffe0000;
+		bat->vs = (val & 2) ? 1 : 0;
+		bat->vp = (val & 1) ? 1 : 0;
+		bat->raw = (bat->raw & 0xffffffff00000000ULL) | val;
+	} else {
+		/* Lower BAT */
+		bat->brpn = val & 0xfffe0000;
+		bat->wimg = (val >> 3) & 0xf;
+		bat->pp = val & 3;
+		bat->raw = (bat->raw & 0x00000000ffffffffULL) | ((u64)val << 32);
+	}
+}
+
+static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	struct kvmppc_bat *bat;
+
+	switch (sprn) {
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+		bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
+		break;
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+		bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
+		break;
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+		bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
+		break;
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+		bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
+		break;
+	default:
+		BUG();
+	}
+
+	return bat;
+}
+
+int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
+		to_book3s(vcpu)->sdr1 = spr_val;
+		break;
+	case SPRN_DSISR:
+		kvmppc_set_dsisr(vcpu, spr_val);
+		break;
+	case SPRN_DAR:
+		kvmppc_set_dar(vcpu, spr_val);
+		break;
+	case SPRN_HIOR:
+		to_book3s(vcpu)->hior = spr_val;
+		break;
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val);
+		/* BAT writes happen so rarely that we're ok to flush
+		 * everything here */
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+		break;
+	}
+	case SPRN_HID0:
+		to_book3s(vcpu)->hid[0] = spr_val;
+		break;
+	case SPRN_HID1:
+		to_book3s(vcpu)->hid[1] = spr_val;
+		break;
+	case SPRN_HID2:
+		to_book3s(vcpu)->hid[2] = spr_val;
+		break;
+	case SPRN_HID2_GEKKO:
+		to_book3s(vcpu)->hid[2] = spr_val;
+		/* HID2.PSE controls paired single on gekko */
+		switch (vcpu->arch.pvr) {
+		case 0x00080200:	/* lonestar 2.0 */
+		case 0x00088202:	/* lonestar 2.2 */
+		case 0x70000100:	/* gekko 1.0 */
+		case 0x00080100:	/* gekko 2.0 */
+		case 0x00083203:	/* gekko 2.3a */
+		case 0x00083213:	/* gekko 2.3b */
+		case 0x00083204:	/* gekko 2.4 */
+		case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+		case 0x00087200:	/* broadway */
+			if (vcpu->arch.hflags & BOOK3S_HFLAG_NATIVE_PS) {
+				/* Native paired singles */
+			} else if (spr_val & (1 << 29)) { /* HID2.PSE */
+				vcpu->arch.hflags |= BOOK3S_HFLAG_PAIRED_SINGLE;
+				kvmppc_giveup_ext(vcpu, MSR_FP);
+			} else {
+				vcpu->arch.hflags &= ~BOOK3S_HFLAG_PAIRED_SINGLE;
+			}
+			break;
+		}
+		break;
+	case SPRN_HID4:
+	case SPRN_HID4_GEKKO:
+		to_book3s(vcpu)->hid[4] = spr_val;
+		break;
+	case SPRN_HID5:
+		to_book3s(vcpu)->hid[5] = spr_val;
+		/* guest HID5 set can change is_dcbz32 */
+		if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+		    (mfmsr() & MSR_HV))
+			vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+		break;
+	case SPRN_GQR0:
+	case SPRN_GQR1:
+	case SPRN_GQR2:
+	case SPRN_GQR3:
+	case SPRN_GQR4:
+	case SPRN_GQR5:
+	case SPRN_GQR6:
+	case SPRN_GQR7:
+		to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val;
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_FSCR:
+		kvmppc_set_fscr(vcpu, spr_val);
+		break;
+	case SPRN_BESCR:
+		vcpu->arch.bescr = spr_val;
+		break;
+	case SPRN_EBBHR:
+		vcpu->arch.ebbhr = spr_val;
+		break;
+	case SPRN_EBBRR:
+		vcpu->arch.ebbrr = spr_val;
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case SPRN_TFHAR:
+	case SPRN_TEXASR:
+	case SPRN_TFIAR:
+		if (!cpu_has_feature(CPU_FTR_TM))
+			break;
+
+		if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+			kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+			emulated = EMULATE_AGAIN;
+			break;
+		}
+
+		if (MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)) &&
+			!((MSR_TM_SUSPENDED(kvmppc_get_msr(vcpu))) &&
+					(sprn == SPRN_TFHAR))) {
+			/* it is illegal to mtspr() TM regs in
+			 * other than non-transactional state, with
+			 * the exception of TFHAR in suspend state.
+			 */
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+			emulated = EMULATE_AGAIN;
+			break;
+		}
+
+		tm_enable();
+		if (sprn == SPRN_TFHAR)
+			mtspr(SPRN_TFHAR, spr_val);
+		else if (sprn == SPRN_TEXASR)
+			mtspr(SPRN_TEXASR, spr_val);
+		else
+			mtspr(SPRN_TFIAR, spr_val);
+		tm_disable();
+
+		break;
+#endif
+#endif
+	case SPRN_ICTC:
+	case SPRN_THRM1:
+	case SPRN_THRM2:
+	case SPRN_THRM3:
+	case SPRN_CTRLF:
+	case SPRN_CTRLT:
+	case SPRN_L2CR:
+	case SPRN_DSCR:
+	case SPRN_MMCR0_GEKKO:
+	case SPRN_MMCR1_GEKKO:
+	case SPRN_PMC1_GEKKO:
+	case SPRN_PMC2_GEKKO:
+	case SPRN_PMC3_GEKKO:
+	case SPRN_PMC4_GEKKO:
+	case SPRN_WPAR_GEKKO:
+	case SPRN_MSSSR0:
+	case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_MMCRS:
+	case SPRN_MMCRA:
+	case SPRN_MMCR0:
+	case SPRN_MMCR1:
+	case SPRN_MMCR2:
+	case SPRN_UMMCR2:
+	case SPRN_UAMOR:
+	case SPRN_IAMR:
+	case SPRN_AMR:
+#endif
+		break;
+unprivileged:
+	default:
+		pr_info_ratelimited("KVM: invalid SPR write: %d\n", sprn);
+		if (sprn & 0x10) {
+			if (kvmppc_get_msr(vcpu) & MSR_PR) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+			}
+		} else {
+			if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+				emulated = EMULATE_AGAIN;
+			}
+		}
+		break;
+	}
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		if (sprn % 2)
+			*spr_val = bat->raw >> 32;
+		else
+			*spr_val = bat->raw;
+
+		break;
+	}
+	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
+		*spr_val = to_book3s(vcpu)->sdr1;
+		break;
+	case SPRN_DSISR:
+		*spr_val = kvmppc_get_dsisr(vcpu);
+		break;
+	case SPRN_DAR:
+		*spr_val = kvmppc_get_dar(vcpu);
+		break;
+	case SPRN_HIOR:
+		*spr_val = to_book3s(vcpu)->hior;
+		break;
+	case SPRN_HID0:
+		*spr_val = to_book3s(vcpu)->hid[0];
+		break;
+	case SPRN_HID1:
+		*spr_val = to_book3s(vcpu)->hid[1];
+		break;
+	case SPRN_HID2:
+	case SPRN_HID2_GEKKO:
+		*spr_val = to_book3s(vcpu)->hid[2];
+		break;
+	case SPRN_HID4:
+	case SPRN_HID4_GEKKO:
+		*spr_val = to_book3s(vcpu)->hid[4];
+		break;
+	case SPRN_HID5:
+		*spr_val = to_book3s(vcpu)->hid[5];
+		break;
+	case SPRN_CFAR:
+	case SPRN_DSCR:
+		*spr_val = 0;
+		break;
+	case SPRN_PURR:
+		/*
+		 * On exit we would have updated purr
+		 */
+		*spr_val = vcpu->arch.purr;
+		break;
+	case SPRN_SPURR:
+		/*
+		 * On exit we would have updated spurr
+		 */
+		*spr_val = vcpu->arch.spurr;
+		break;
+	case SPRN_VTB:
+		*spr_val = to_book3s(vcpu)->vtb;
+		break;
+	case SPRN_IC:
+		*spr_val = vcpu->arch.ic;
+		break;
+	case SPRN_GQR0:
+	case SPRN_GQR1:
+	case SPRN_GQR2:
+	case SPRN_GQR3:
+	case SPRN_GQR4:
+	case SPRN_GQR5:
+	case SPRN_GQR6:
+	case SPRN_GQR7:
+		*spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0];
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_FSCR:
+		*spr_val = vcpu->arch.fscr;
+		break;
+	case SPRN_BESCR:
+		*spr_val = vcpu->arch.bescr;
+		break;
+	case SPRN_EBBHR:
+		*spr_val = vcpu->arch.ebbhr;
+		break;
+	case SPRN_EBBRR:
+		*spr_val = vcpu->arch.ebbrr;
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case SPRN_TFHAR:
+	case SPRN_TEXASR:
+	case SPRN_TFIAR:
+		if (!cpu_has_feature(CPU_FTR_TM))
+			break;
+
+		if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+			kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+			emulated = EMULATE_AGAIN;
+			break;
+		}
+
+		tm_enable();
+		if (sprn == SPRN_TFHAR)
+			*spr_val = mfspr(SPRN_TFHAR);
+		else if (sprn == SPRN_TEXASR)
+			*spr_val = mfspr(SPRN_TEXASR);
+		else if (sprn == SPRN_TFIAR)
+			*spr_val = mfspr(SPRN_TFIAR);
+		tm_disable();
+		break;
+#endif
+#endif
+	case SPRN_THRM1:
+	case SPRN_THRM2:
+	case SPRN_THRM3:
+	case SPRN_CTRLF:
+	case SPRN_CTRLT:
+	case SPRN_L2CR:
+	case SPRN_MMCR0_GEKKO:
+	case SPRN_MMCR1_GEKKO:
+	case SPRN_PMC1_GEKKO:
+	case SPRN_PMC2_GEKKO:
+	case SPRN_PMC3_GEKKO:
+	case SPRN_PMC4_GEKKO:
+	case SPRN_WPAR_GEKKO:
+	case SPRN_MSSSR0:
+	case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_MMCRS:
+	case SPRN_MMCRA:
+	case SPRN_MMCR0:
+	case SPRN_MMCR1:
+	case SPRN_MMCR2:
+	case SPRN_UMMCR2:
+	case SPRN_TIR:
+	case SPRN_UAMOR:
+	case SPRN_IAMR:
+	case SPRN_AMR:
+#endif
+		*spr_val = 0;
+		break;
+	default:
+unprivileged:
+		pr_info_ratelimited("KVM: invalid SPR read: %d\n", sprn);
+		if (sprn & 0x10) {
+			if (kvmppc_get_msr(vcpu) & MSR_PR) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+			}
+		} else {
+			if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0 ||
+			    sprn == 4 || sprn == 5 || sprn == 6) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+				emulated = EMULATE_AGAIN;
+			}
+		}
+
+		break;
+	}
+
+	return emulated;
+}
+
+u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+	return make_dsisr(inst);
+}
+
+ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Linux's fix_alignment() assumes that DAR is valid, so can we
+	 */
+	return vcpu->arch.fault_dar;
+#else
+	ulong dar = 0;
+	ulong ra = get_ra(inst);
+	ulong rb = get_rb(inst);
+
+	switch (get_op(inst)) {
+	case OP_LFS:
+	case OP_LFD:
+	case OP_STFD:
+	case OP_STFS:
+		if (ra)
+			dar = kvmppc_get_gpr(vcpu, ra);
+		dar += (s32)((s16)inst);
+		break;
+	case 31:
+		if (ra)
+			dar = kvmppc_get_gpr(vcpu, ra);
+		dar += kvmppc_get_gpr(vcpu, rb);
+		break;
+	default:
+		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
+		break;
+	}
+
+	return dar;
+#endif
+}
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
new file mode 100644
index 0000000000..f08565885d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/export.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#endif
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 0000000000..0429488ba1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,6360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/stat.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <linux/srcu.h>
+#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/of.h>
+#include <linux/irqdomain.h>
+#include <linux/smp.h>
+
+#include <asm/ftrace.h>
+#include <asm/reg.h>
+#include <asm/ppc-opcode.h>
+#include <asm/asm-prototypes.h>
+#include <asm/archrandom.h>
+#include <asm/debug.h>
+#include <asm/disassemble.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <linux/uaccess.h>
+#include <asm/interrupt.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/pmc.h>
+#include <asm/processor.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/switch_to.h>
+#include <asm/smp.h>
+#include <asm/dbell.h>
+#include <asm/hmi.h>
+#include <asm/pnv-pci.h>
+#include <asm/mmu.h>
+#include <asm/opal.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/kvm_book3s_uvmem.h>
+#include <asm/ultravisor.h>
+#include <asm/dtl.h>
+#include <asm/plpar_wrappers.h>
+
+#include <trace/events/ipi.h>
+
+#include "book3s.h"
+#include "book3s_hv.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_hv.h"
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL	(~(u64)0)
+
+static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
+
+static int dynamic_mt_modes = 6;
+module_param(dynamic_mt_modes, int, 0644);
+MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
+static int target_smt_mode;
+module_param(target_smt_mode, int, 0644);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
+
+static bool one_vm_per_core;
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
+
+#ifdef CONFIG_KVM_XICS
+static const struct kernel_param_ops module_param_ops = {
+	.set = param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, 0644);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+#endif
+
+/* If set, guests are allowed to create and control nested guests */
+static bool nested = true;
+module_param(nested, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
+
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+
+/*
+ * RWMR values for POWER8.  These control the rate at which PURR
+ * and SPURR count and should be set according to the number of
+ * online threads in the vcore being run.
+ */
+#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
+#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
+#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL
+
+static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
+	RWMR_RPA_P8_1THREAD,
+	RWMR_RPA_P8_1THREAD,
+	RWMR_RPA_P8_2THREAD,
+	RWMR_RPA_P8_3THREAD,
+	RWMR_RPA_P8_4THREAD,
+	RWMR_RPA_P8_5THREAD,
+	RWMR_RPA_P8_6THREAD,
+	RWMR_RPA_P8_7THREAD,
+	RWMR_RPA_P8_8THREAD,
+};
+
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+		int *ip)
+{
+	int i = *ip;
+	struct kvm_vcpu *vcpu;
+
+	while (++i < MAX_SMT_THREADS) {
+		vcpu = READ_ONCE(vc->runnable_threads[i]);
+		if (vcpu) {
+			*ip = i;
+			return vcpu;
+		}
+	}
+	return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
+static bool kvmppc_ipi_thread(int cpu)
+{
+	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+
+	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
+	if (kvmhv_on_pseries())
+		return false;
+
+	/* On POWER9 we can use msgsnd to IPI any cpu */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		msg |= get_hard_smp_processor_id(cpu);
+		smp_mb();
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return true;
+	}
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		preempt_disable();
+		if (cpu_first_thread_sibling(cpu) ==
+		    cpu_first_thread_sibling(smp_processor_id())) {
+			msg |= cpu_thread_in_core(cpu);
+			smp_mb();
+			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+			preempt_enable();
+			return true;
+		}
+		preempt_enable();
+	}
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+	if (cpu >= 0 && cpu < nr_cpu_ids) {
+		if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
+			xics_wake_cpu(cpu);
+			return true;
+		}
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct rcuwait *waitp;
+
+	/*
+	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
+	 * create pending work vs below loads of cpu fields. The other side
+	 * is the barrier in vcpu run that orders setting the cpu fields vs
+	 * testing for pending work.
+	 */
+
+	waitp = kvm_arch_vcpu_get_wait(vcpu);
+	if (rcuwait_wake_up(waitp))
+		++vcpu->stat.generic.halt_wakeup;
+
+	cpu = READ_ONCE(vcpu->arch.thread_cpu);
+	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
+		return;
+
+	/* CPU points to the first thread of the core */
+	cpu = vcpu->cpu;
+	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+		smp_send_reschedule(cpu);
+}
+
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ *
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping.  Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen.  We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
+ * the kernel.  We don't have any way of dividing up that time
+ * between time that the vcpu is genuinely stopped, time that
+ * the task is actively working on behalf of the vcpu, and time
+ * that the task is preempted, so we don't count any of it as
+ * stolen.
+ *
+ * Updates to busy_stolen are protected by arch.tbacct_lock;
+ * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
+ * lock.  The stolen times are measured in units of timebase ticks.
+ * (Note that the != TB_NIL checks below are purely defensive;
+ * they should never fail.)
+ *
+ * The POWER9 path is simpler, one vcpu per virtual core so the
+ * former case does not exist. If a vcpu is preempted when it is
+ * BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate
+ * the stolen cycles in busy_stolen. RUNNING is not a preemptible
+ * state in the P9 path.
+ */
+
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
+{
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	vc->preempt_tb = tb;
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
+{
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	if (vc->preempt_tb != TB_NIL) {
+		vc->stolen_tb += tb - vc->preempt_tb;
+		vc->preempt_tb = TB_NIL;
+	}
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long flags;
+	u64 now;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (vcpu->arch.busy_preempt != TB_NIL) {
+			WARN_ON_ONCE(vcpu->arch.state != KVMPPC_VCPU_BUSY_IN_HOST);
+			vc->stolen_tb += mftb() - vcpu->arch.busy_preempt;
+			vcpu->arch.busy_preempt = TB_NIL;
+		}
+		return;
+	}
+
+	now = mftb();
+
+	/*
+	 * We can test vc->runner without taking the vcore lock,
+	 * because only this task ever sets vc->runner to this
+	 * vcpu, and once it is set to this vcpu, only this task
+	 * ever sets it to NULL.
+	 */
+	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+		kvmppc_core_end_stolen(vc, now);
+
+	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
+	    vcpu->arch.busy_preempt != TB_NIL) {
+		vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
+		vcpu->arch.busy_preempt = TB_NIL;
+	}
+	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+}
+
+static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long flags;
+	u64 now;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/*
+		 * In the P9 path, RUNNABLE is not preemptible
+		 * (nor takes host interrupts)
+		 */
+		WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE);
+		/*
+		 * Account stolen time when preempted while the vcpu task is
+		 * running in the kernel (but not in qemu, which is INACTIVE).
+		 */
+		if (task_is_running(current) &&
+				vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+			vcpu->arch.busy_preempt = mftb();
+		return;
+	}
+
+	now = mftb();
+
+	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+		kvmppc_core_start_stolen(vc, now);
+
+	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+		vcpu->arch.busy_preempt = now;
+	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+}
+
+static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+/* Dummy value used in computing PCR value below */
+#define PCR_ARCH_31    (PCR_ARCH_300 << 1)
+
+static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+{
+	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	/* We can (emulate) our own architecture version and anything older */
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		host_pcr_bit = PCR_ARCH_31;
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
+		host_pcr_bit = PCR_ARCH_300;
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		host_pcr_bit = PCR_ARCH_207;
+	else if (cpu_has_feature(CPU_FTR_ARCH_206))
+		host_pcr_bit = PCR_ARCH_206;
+	else
+		host_pcr_bit = PCR_ARCH_205;
+
+	/* Determine lowest PCR bit needed to run guest in given PVR level */
+	guest_pcr_bit = host_pcr_bit;
+	if (arch_compat) {
+		switch (arch_compat) {
+		case PVR_ARCH_205:
+			guest_pcr_bit = PCR_ARCH_205;
+			break;
+		case PVR_ARCH_206:
+		case PVR_ARCH_206p:
+			guest_pcr_bit = PCR_ARCH_206;
+			break;
+		case PVR_ARCH_207:
+			guest_pcr_bit = PCR_ARCH_207;
+			break;
+		case PVR_ARCH_300:
+			guest_pcr_bit = PCR_ARCH_300;
+			break;
+		case PVR_ARCH_31:
+			guest_pcr_bit = PCR_ARCH_31;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* Check requested PCR bits don't exceed our capabilities */
+	if (guest_pcr_bit > host_pcr_bit)
+		return -EINVAL;
+
+	spin_lock(&vc->lock);
+	vc->arch_compat = arch_compat;
+	/*
+	 * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
+	 * Also set all reserved PCR bits
+	 */
+	vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
+	spin_unlock(&vc->lock);
+
+	return 0;
+}
+
+static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.regs.ctr, vcpu->arch.regs.link);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.16lx\n",
+	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+	return kvm_get_vcpu_by_id(kvm, id);
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
+	vpa->yield_count = cpu_to_be32(1);
+}
+
+static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
+		   unsigned long addr, unsigned long len)
+{
+	/* check address is cacheline aligned */
+	if (addr & (L1_CACHE_BYTES - 1))
+		return -EINVAL;
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (v->next_gpa != addr || v->len != len) {
+		v->next_gpa = addr;
+		v->len = addr ? len : 0;
+		v->update_pending = 1;
+	}
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	return 0;
+}
+
+/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
+struct reg_vpa {
+	u32 dummy;
+	union {
+		__be16 hword;
+		__be32 word;
+	} length;
+};
+
+static int vpa_is_registered(struct kvmppc_vpa *vpap)
+{
+	if (vpap->update_pending)
+		return vpap->next_gpa != 0;
+	return vpap->pinned_addr != NULL;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+				       unsigned long flags,
+				       unsigned long vcpuid, unsigned long vpa)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long len, nb;
+	void *va;
+	struct kvm_vcpu *tvcpu;
+	int err;
+	int subfunc;
+	struct kvmppc_vpa *vpap;
+
+	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+	if (!tvcpu)
+		return H_PARAMETER;
+
+	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
+	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
+	    subfunc == H_VPA_REG_SLB) {
+		/* Registering new area - address must be cache-line aligned */
+		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
+			return H_PARAMETER;
+
+		/* convert logical addr to kernel addr and read length */
+		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+		if (va == NULL)
+			return H_PARAMETER;
+		if (subfunc == H_VPA_REG_VPA)
+			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
+		else
+			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
+		kvmppc_unpin_guest_page(kvm, va, vpa, false);
+
+		/* Check length */
+		if (len > nb || len < sizeof(struct reg_vpa))
+			return H_PARAMETER;
+	} else {
+		vpa = 0;
+		len = 0;
+	}
+
+	err = H_PARAMETER;
+	vpap = NULL;
+	spin_lock(&tvcpu->arch.vpa_update_lock);
+
+	switch (subfunc) {
+	case H_VPA_REG_VPA:		/* register VPA */
+		/*
+		 * The size of our lppaca is 1kB because of the way we align
+		 * it for the guest to avoid crossing a 4kB boundary. We only
+		 * use 640 bytes of the structure though, so we should accept
+		 * clients that set a size of 640.
+		 */
+		BUILD_BUG_ON(sizeof(struct lppaca) != 640);
+		if (len < sizeof(struct lppaca))
+			break;
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_REG_DTL:		/* register DTL */
+		if (len < sizeof(struct dtl_entry))
+			break;
+		len -= len % sizeof(struct dtl_entry);
+
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
+			break;
+
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
+			break;
+
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_VPA:		/* deregister VPA */
+		/* Check they don't still have a DTL or SLB buf registered */
+		err = H_RESOURCE;
+		if (vpa_is_registered(&tvcpu->arch.dtl) ||
+		    vpa_is_registered(&tvcpu->arch.slb_shadow))
+			break;
+
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_DTL:		/* deregister DTL */
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
+	}
+
+	if (vpap) {
+		vpap->next_gpa = vpa;
+		vpap->len = len;
+		vpap->update_pending = 1;
+	}
+
+	spin_unlock(&tvcpu->arch.vpa_update_lock);
+
+	return err;
+}
+
+static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	void *va;
+	unsigned long nb;
+	unsigned long gpa;
+
+	/*
+	 * We need to pin the page pointed to by vpap->next_gpa,
+	 * but we can't call kvmppc_pin_guest_page under the lock
+	 * as it does get_user_pages() and down_read().  So we
+	 * have to drop the lock, pin the page, then get the lock
+	 * again and check that a new area didn't get registered
+	 * in the meantime.
+	 */
+	for (;;) {
+		gpa = vpap->next_gpa;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		va = NULL;
+		nb = 0;
+		if (gpa)
+			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		if (gpa == vpap->next_gpa)
+			break;
+		/* sigh... unpin that one and try again */
+		if (va)
+			kvmppc_unpin_guest_page(kvm, va, gpa, false);
+	}
+
+	vpap->update_pending = 0;
+	if (va && nb < vpap->len) {
+		/*
+		 * If it's now too short, it must be that userspace
+		 * has changed the mappings underlying guest memory,
+		 * so unregister the region.
+		 */
+		kvmppc_unpin_guest_page(kvm, va, gpa, false);
+		va = NULL;
+	}
+	if (vpap->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+					vpap->dirty);
+	vpap->gpa = gpa;
+	vpap->pinned_addr = va;
+	vpap->dirty = false;
+	if (va)
+		vpap->pinned_end = va + vpap->len;
+}
+
+static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.vpa.update_pending ||
+	      vcpu->arch.slb_shadow.update_pending ||
+	      vcpu->arch.dtl.update_pending))
+		return;
+
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (vcpu->arch.vpa.update_pending) {
+		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
+		if (vcpu->arch.vpa.pinned_addr)
+			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+	}
+	if (vcpu->arch.dtl.update_pending) {
+		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
+		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
+		vcpu->arch.dtl_index = 0;
+	}
+	if (vcpu->arch.slb_shadow.update_pending)
+		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
+/*
+ * Return the accumulated stolen time for the vcore up until `now'.
+ * The caller should hold the vcore lock.
+ */
+static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
+{
+	u64 p;
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	p = vc->stolen_tb;
+	if (vc->vcore_state != VCORE_INACTIVE &&
+	    vc->preempt_tb != TB_NIL)
+		p += now - vc->preempt_tb;
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+	return p;
+}
+
+static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+					struct lppaca *vpa,
+					unsigned int pcpu, u64 now,
+					unsigned long stolen)
+{
+	struct dtl_entry *dt;
+
+	dt = vcpu->arch.dtl_ptr;
+
+	if (!dt)
+		return;
+
+	dt->dispatch_reason = 7;
+	dt->preempt_reason = 0;
+	dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
+	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
+	dt->ready_to_enqueue_time = 0;
+	dt->waiting_to_ready_time = 0;
+	dt->timebase = cpu_to_be64(now);
+	dt->fault_addr = 0;
+	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
+	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
+
+	++dt;
+	if (dt == vcpu->arch.dtl.pinned_end)
+		dt = vcpu->arch.dtl.pinned_addr;
+	vcpu->arch.dtl_ptr = dt;
+	/* order writing *dt vs. writing vpa->dtl_idx */
+	smp_wmb();
+	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
+
+	/* vcpu->arch.dtl.dirty is set by the caller */
+}
+
+static void kvmppc_update_vpa_dispatch(struct kvm_vcpu *vcpu,
+				       struct kvmppc_vcore *vc)
+{
+	struct lppaca *vpa;
+	unsigned long stolen;
+	unsigned long core_stolen;
+	u64 now;
+	unsigned long flags;
+
+	vpa = vcpu->arch.vpa.pinned_addr;
+	if (!vpa)
+		return;
+
+	now = mftb();
+
+	core_stolen = vcore_stolen_time(vc, now);
+	stolen = core_stolen - vcpu->arch.stolen_logged;
+	vcpu->arch.stolen_logged = core_stolen;
+	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+	stolen += vcpu->arch.busy_stolen;
+	vcpu->arch.busy_stolen = 0;
+	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+
+	vpa->enqueue_dispatch_tb = cpu_to_be64(be64_to_cpu(vpa->enqueue_dispatch_tb) + stolen);
+
+	__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now + vc->tb_offset, stolen);
+
+	vcpu->arch.vpa.dirty = true;
+}
+
+static void kvmppc_update_vpa_dispatch_p9(struct kvm_vcpu *vcpu,
+				       struct kvmppc_vcore *vc,
+				       u64 now)
+{
+	struct lppaca *vpa;
+	unsigned long stolen;
+	unsigned long stolen_delta;
+
+	vpa = vcpu->arch.vpa.pinned_addr;
+	if (!vpa)
+		return;
+
+	stolen = vc->stolen_tb;
+	stolen_delta = stolen - vcpu->arch.stolen_logged;
+	vcpu->arch.stolen_logged = stolen;
+
+	vpa->enqueue_dispatch_tb = cpu_to_be64(stolen);
+
+	__kvmppc_create_dtl_entry(vcpu, vpa, vc->pcpu, now, stolen_delta);
+
+	vcpu->arch.vpa.dirty = true;
+}
+
+/* See if there is a doorbell interrupt pending for a vcpu */
+static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
+{
+	int thr;
+	struct kvmppc_vcore *vc;
+
+	if (vcpu->arch.doorbell_request)
+		return true;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return false;
+	/*
+	 * Ensure that the read of vcore->dpdes comes after the read
+	 * of vcpu->doorbell_request.  This barrier matches the
+	 * smp_wmb() in kvmppc_guest_entry_inject().
+	 */
+	smp_rmb();
+	vc = vcpu->arch.vcore;
+	thr = vcpu->vcpu_id - vc->first_vcpuid;
+	return !!(vc->dpdes & (1 << thr));
+}
+
+static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
+		return true;
+	if ((!vcpu->arch.vcore->arch_compat) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S))
+		return true;
+	return false;
+}
+
+static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
+			     unsigned long resource, unsigned long value1,
+			     unsigned long value2)
+{
+	switch (resource) {
+	case H_SET_MODE_RESOURCE_SET_CIABR:
+		if (!kvmppc_power8_compatible(vcpu))
+			return H_P2;
+		if (value2)
+			return H_P4;
+		if (mflags)
+			return H_UNSUPPORTED_FLAG_START;
+		/* Guests can't breakpoint the hypervisor */
+		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
+			return H_P3;
+		kvmppc_set_ciabr_hv(vcpu, value1);
+		return H_SUCCESS;
+	case H_SET_MODE_RESOURCE_SET_DAWR0:
+		if (!kvmppc_power8_compatible(vcpu))
+			return H_P2;
+		if (!ppc_breakpoint_available())
+			return H_P2;
+		if (mflags)
+			return H_UNSUPPORTED_FLAG_START;
+		if (value2 & DABRX_HYP)
+			return H_P4;
+		kvmppc_set_dawr0_hv(vcpu, value1);
+		kvmppc_set_dawrx0_hv(vcpu, value2);
+		return H_SUCCESS;
+	case H_SET_MODE_RESOURCE_SET_DAWR1:
+		if (!kvmppc_power8_compatible(vcpu))
+			return H_P2;
+		if (!ppc_breakpoint_available())
+			return H_P2;
+		if (!cpu_has_feature(CPU_FTR_DAWR1))
+			return H_P2;
+		if (!vcpu->kvm->arch.dawr1_enabled)
+			return H_FUNCTION;
+		if (mflags)
+			return H_UNSUPPORTED_FLAG_START;
+		if (value2 & DABRX_HYP)
+			return H_P4;
+		kvmppc_set_dawr1_hv(vcpu, value1);
+		kvmppc_set_dawrx1_hv(vcpu, value2);
+		return H_SUCCESS;
+	case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
+		/*
+		 * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
+		 * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
+		 */
+		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+				kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
+			return H_UNSUPPORTED_FLAG_START;
+		return H_TOO_HARD;
+	default:
+		return H_TOO_HARD;
+	}
+}
+
+/* Copy guest memory in place - must reside within a single memslot */
+static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
+				  unsigned long len)
+{
+	struct kvm_memory_slot *to_memslot = NULL;
+	struct kvm_memory_slot *from_memslot = NULL;
+	unsigned long to_addr, from_addr;
+	int r;
+
+	/* Get HPA for from address */
+	from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
+	if (!from_memslot)
+		return -EFAULT;
+	if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
+			     << PAGE_SHIFT))
+		return -EINVAL;
+	from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
+	if (kvm_is_error_hva(from_addr))
+		return -EFAULT;
+	from_addr |= (from & (PAGE_SIZE - 1));
+
+	/* Get HPA for to address */
+	to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
+	if (!to_memslot)
+		return -EFAULT;
+	if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
+			   << PAGE_SHIFT))
+		return -EINVAL;
+	to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
+	if (kvm_is_error_hva(to_addr))
+		return -EFAULT;
+	to_addr |= (to & (PAGE_SIZE - 1));
+
+	/* Perform copy */
+	r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
+			     len);
+	if (r)
+		return -EFAULT;
+	mark_page_dirty(kvm, to >> PAGE_SHIFT);
+	return 0;
+}
+
+static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
+			       unsigned long dest, unsigned long src)
+{
+	u64 pg_sz = SZ_4K;		/* 4K page size */
+	u64 pg_mask = SZ_4K - 1;
+	int ret;
+
+	/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
+	if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
+		      H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
+		return H_PARAMETER;
+
+	/* dest (and src if copy_page flag set) must be page aligned */
+	if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
+		return H_PARAMETER;
+
+	/* zero and/or copy the page as determined by the flags */
+	if (flags & H_COPY_PAGE) {
+		ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
+		if (ret < 0)
+			return H_PARAMETER;
+	} else if (flags & H_ZERO_PAGE) {
+		ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
+		if (ret < 0)
+			return H_PARAMETER;
+	}
+
+	/* We can ignore the remaining flags */
+
+	return H_SUCCESS;
+}
+
+static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
+{
+	struct kvmppc_vcore *vcore = target->arch.vcore;
+
+	/*
+	 * We expect to have been called by the real mode handler
+	 * (kvmppc_rm_h_confer()) which would have directly returned
+	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
+	 * have useful work to do and should not confer) so we don't
+	 * recheck that here.
+	 *
+	 * In the case of the P9 single vcpu per vcore case, the real
+	 * mode handler is not called but no other threads are in the
+	 * source vcore.
+	 */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		spin_lock(&vcore->lock);
+		if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
+		    vcore->vcore_state != VCORE_INACTIVE &&
+		    vcore->runner)
+			target = vcore->runner;
+		spin_unlock(&vcore->lock);
+	}
+
+	return kvm_vcpu_yield_to(target);
+}
+
+static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
+{
+	int yield_count = 0;
+	struct lppaca *lppaca;
+
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
+	if (lppaca)
+		yield_count = be32_to_cpu(lppaca->yield_count);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	return yield_count;
+}
+
+/*
+ * H_RPT_INVALIDATE hcall handler for nested guests.
+ *
+ * Handles only nested process-scoped invalidation requests in L0.
+ */
+static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
+{
+	unsigned long type = kvmppc_get_gpr(vcpu, 6);
+	unsigned long pid, pg_sizes, start, end;
+
+	/*
+	 * The partition-scoped invalidations aren't handled here in L0.
+	 */
+	if (type & H_RPTI_TYPE_NESTED)
+		return RESUME_HOST;
+
+	pid = kvmppc_get_gpr(vcpu, 4);
+	pg_sizes = kvmppc_get_gpr(vcpu, 7);
+	start = kvmppc_get_gpr(vcpu, 8);
+	end = kvmppc_get_gpr(vcpu, 9);
+
+	do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
+				type, pg_sizes, start, end);
+
+	kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+	return RESUME_GUEST;
+}
+
+static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
+				    unsigned long id, unsigned long target,
+				    unsigned long type, unsigned long pg_sizes,
+				    unsigned long start, unsigned long end)
+{
+	if (!kvm_is_radix(vcpu->kvm))
+		return H_UNSUPPORTED;
+
+	if (end < start)
+		return H_P5;
+
+	/*
+	 * Partition-scoped invalidation for nested guests.
+	 */
+	if (type & H_RPTI_TYPE_NESTED) {
+		if (!nesting_enabled(vcpu->kvm))
+			return H_FUNCTION;
+
+		/* Support only cores as target */
+		if (target != H_RPTI_TARGET_CMMU)
+			return H_P2;
+
+		return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
+					       start, end);
+	}
+
+	/*
+	 * Process-scoped invalidation for L1 guests.
+	 */
+	do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
+				type, pg_sizes, start, end);
+	return H_SUCCESS;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long req = kvmppc_get_gpr(vcpu, 3);
+	unsigned long target, ret = H_SUCCESS;
+	int yield_count;
+	struct kvm_vcpu *tvcpu;
+	int idx, rc;
+
+	if (req <= MAX_HCALL_OPCODE &&
+	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
+		return RESUME_HOST;
+
+	switch (req) {
+	case H_REMOVE:
+		ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_ENTER:
+		ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6),
+					kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_READ:
+		ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_CLEAR_MOD:
+		ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_CLEAR_REF:
+		ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PROTECT:
+		ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_BULK_REMOVE:
+		ret = kvmppc_h_bulk_remove(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+
+	case H_CEDE:
+		break;
+	case H_PROD:
+		target = kvmppc_get_gpr(vcpu, 4);
+		tvcpu = kvmppc_find_vcpu(kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		tvcpu->arch.prodded = 1;
+		smp_mb(); /* This orders prodded store vs ceded load */
+		if (tvcpu->arch.ceded)
+			kvmppc_fast_vcpu_kick_hv(tvcpu);
+		break;
+	case H_CONFER:
+		target = kvmppc_get_gpr(vcpu, 4);
+		if (target == -1)
+			break;
+		tvcpu = kvmppc_find_vcpu(kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		yield_count = kvmppc_get_gpr(vcpu, 5);
+		if (kvmppc_get_yield_count(tvcpu) != yield_count)
+			break;
+		kvm_arch_vcpu_yield_to(tvcpu);
+		break;
+	case H_REGISTER_VPA:
+		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_RTAS:
+		if (list_empty(&kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+
+		idx = srcu_read_lock(&kvm->srcu);
+		rc = kvmppc_rtas_hcall(vcpu);
+		srcu_read_unlock(&kvm->srcu, idx);
+
+		if (rc == -ENOENT)
+			return RESUME_HOST;
+		else if (rc == 0)
+			break;
+
+		/* Send the error out to userspace via KVM_RUN */
+		return rc;
+	case H_LOGICAL_CI_LOAD:
+		ret = kvmppc_h_logical_ci_load(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_LOGICAL_CI_STORE:
+		ret = kvmppc_h_logical_ci_store(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_SET_MODE:
+		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6),
+					kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+		if (kvmppc_xics_enabled(vcpu)) {
+			if (xics_on_xive()) {
+				ret = H_NOT_AVAILABLE;
+				return RESUME_GUEST;
+			}
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		}
+		return RESUME_HOST;
+	case H_SET_DABR:
+		ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_SET_XDABR:
+		ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5));
+		break;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case H_GET_TCE:
+		ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PUT_TCE:
+		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PUT_TCE_INDIRECT:
+		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_STUFF_TCE:
+		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+#endif
+	case H_RANDOM:
+		if (!arch_get_random_seed_longs(&vcpu->arch.regs.gpr[4], 1))
+			ret = H_HARDWARE;
+		break;
+	case H_RPT_INVALIDATE:
+		ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
+					      kvmppc_get_gpr(vcpu, 5),
+					      kvmppc_get_gpr(vcpu, 6),
+					      kvmppc_get_gpr(vcpu, 7),
+					      kvmppc_get_gpr(vcpu, 8),
+					      kvmppc_get_gpr(vcpu, 9));
+		break;
+
+	case H_SET_PARTITION_TABLE:
+		ret = H_FUNCTION;
+		if (nesting_enabled(kvm))
+			ret = kvmhv_set_partition_table(vcpu);
+		break;
+	case H_ENTER_NESTED:
+		ret = H_FUNCTION;
+		if (!nesting_enabled(kvm))
+			break;
+		ret = kvmhv_enter_nested_guest(vcpu);
+		if (ret == H_INTERRUPT) {
+			kvmppc_set_gpr(vcpu, 3, 0);
+			vcpu->arch.hcall_needed = 0;
+			return -EINTR;
+		} else if (ret == H_TOO_HARD) {
+			kvmppc_set_gpr(vcpu, 3, 0);
+			vcpu->arch.hcall_needed = 0;
+			return RESUME_HOST;
+		}
+		break;
+	case H_TLB_INVALIDATE:
+		ret = H_FUNCTION;
+		if (nesting_enabled(kvm))
+			ret = kvmhv_do_nested_tlbie(vcpu);
+		break;
+	case H_COPY_TOFROM_GUEST:
+		ret = H_FUNCTION;
+		if (nesting_enabled(kvm))
+			ret = kvmhv_copy_tofrom_guest_nested(vcpu);
+		break;
+	case H_PAGE_INIT:
+		ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
+					 kvmppc_get_gpr(vcpu, 5),
+					 kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_SVM_PAGE_IN:
+		ret = H_UNSUPPORTED;
+		if (kvmppc_get_srr1(vcpu) & MSR_S)
+			ret = kvmppc_h_svm_page_in(kvm,
+						   kvmppc_get_gpr(vcpu, 4),
+						   kvmppc_get_gpr(vcpu, 5),
+						   kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_SVM_PAGE_OUT:
+		ret = H_UNSUPPORTED;
+		if (kvmppc_get_srr1(vcpu) & MSR_S)
+			ret = kvmppc_h_svm_page_out(kvm,
+						    kvmppc_get_gpr(vcpu, 4),
+						    kvmppc_get_gpr(vcpu, 5),
+						    kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_SVM_INIT_START:
+		ret = H_UNSUPPORTED;
+		if (kvmppc_get_srr1(vcpu) & MSR_S)
+			ret = kvmppc_h_svm_init_start(kvm);
+		break;
+	case H_SVM_INIT_DONE:
+		ret = H_UNSUPPORTED;
+		if (kvmppc_get_srr1(vcpu) & MSR_S)
+			ret = kvmppc_h_svm_init_done(kvm);
+		break;
+	case H_SVM_INIT_ABORT:
+		/*
+		 * Even if that call is made by the Ultravisor, the SSR1 value
+		 * is the guest context one, with the secure bit clear as it has
+		 * not yet been secured. So we can't check it here.
+		 * Instead the kvm->arch.secure_guest flag is checked inside
+		 * kvmppc_h_svm_init_abort().
+		 */
+		ret = kvmppc_h_svm_init_abort(kvm);
+		break;
+
+	default:
+		return RESUME_HOST;
+	}
+	WARN_ON_ONCE(ret == H_TOO_HARD);
+	kvmppc_set_gpr(vcpu, 3, ret);
+	vcpu->arch.hcall_needed = 0;
+	return RESUME_GUEST;
+}
+
+/*
+ * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
+ * handlers in book3s_hv_rmhandlers.S.
+ *
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
+ */
+static void kvmppc_cede(struct kvm_vcpu *vcpu)
+{
+	__kvmppc_set_msr_hv(vcpu, __kvmppc_get_msr_hv(vcpu) | MSR_EE);
+	vcpu->arch.ceded = 1;
+	smp_mb();
+	if (vcpu->arch.prodded) {
+		vcpu->arch.prodded = 0;
+		smp_mb();
+		vcpu->arch.ceded = 0;
+	}
+}
+
+static int kvmppc_hcall_impl_hv(unsigned long cmd)
+{
+	switch (cmd) {
+	case H_CEDE:
+	case H_PROD:
+	case H_CONFER:
+	case H_REGISTER_VPA:
+	case H_SET_MODE:
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case H_GET_TCE:
+	case H_PUT_TCE:
+	case H_PUT_TCE_INDIRECT:
+	case H_STUFF_TCE:
+#endif
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
+#ifdef CONFIG_KVM_XICS
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+#endif
+	case H_PAGE_INIT:
+	case H_RPT_INVALIDATE:
+		return 1;
+	}
+
+	/* See if it's in the real-mode table */
+	return kvmppc_hcall_impl_hv_realmode(cmd);
+}
+
+static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
+{
+	ppc_inst_t last_inst;
+
+	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
+					EMULATE_DONE) {
+		/*
+		 * Fetch failed, so return to guest and
+		 * try executing it again.
+		 */
+		return RESUME_GUEST;
+	}
+
+	if (ppc_inst_val(last_inst) == KVMPPC_INST_SW_BREAKPOINT) {
+		vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+		vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
+		return RESUME_HOST;
+	} else {
+		kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
+				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
+		return RESUME_GUEST;
+	}
+}
+
+static void do_nothing(void *x)
+{
+}
+
+static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
+{
+	int thr, cpu, pcpu, nthreads;
+	struct kvm_vcpu *v;
+	unsigned long dpdes;
+
+	nthreads = vcpu->kvm->arch.emul_smt_mode;
+	dpdes = 0;
+	cpu = vcpu->vcpu_id & ~(nthreads - 1);
+	for (thr = 0; thr < nthreads; ++thr, ++cpu) {
+		v = kvmppc_find_vcpu(vcpu->kvm, cpu);
+		if (!v)
+			continue;
+		/*
+		 * If the vcpu is currently running on a physical cpu thread,
+		 * interrupt it in order to pull it out of the guest briefly,
+		 * which will update its vcore->dpdes value.
+		 */
+		pcpu = READ_ONCE(v->cpu);
+		if (pcpu >= 0)
+			smp_call_function_single(pcpu, do_nothing, NULL, 1);
+		if (kvmppc_doorbell_pending(v))
+			dpdes |= 1 << thr;
+	}
+	return dpdes;
+}
+
+/*
+ * On POWER9, emulate doorbell-related instructions in order to
+ * give the guest the illusion of running on a multi-threaded core.
+ * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
+ * and mfspr DPDES.
+ */
+static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
+{
+	u32 inst, rb, thr;
+	unsigned long arg;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *tvcpu;
+	ppc_inst_t pinst;
+
+	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst) != EMULATE_DONE)
+		return RESUME_GUEST;
+	inst = ppc_inst_val(pinst);
+	if (get_op(inst) != 31)
+		return EMULATE_FAIL;
+	rb = get_rb(inst);
+	thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
+	switch (get_xop(inst)) {
+	case OP_31_XOP_MSGSNDP:
+		arg = kvmppc_get_gpr(vcpu, rb);
+		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
+			break;
+		arg &= 0x7f;
+		if (arg >= kvm->arch.emul_smt_mode)
+			break;
+		tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
+		if (!tvcpu)
+			break;
+		if (!tvcpu->arch.doorbell_request) {
+			tvcpu->arch.doorbell_request = 1;
+			kvmppc_fast_vcpu_kick_hv(tvcpu);
+		}
+		break;
+	case OP_31_XOP_MSGCLRP:
+		arg = kvmppc_get_gpr(vcpu, rb);
+		if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
+			break;
+		vcpu->arch.vcore->dpdes = 0;
+		vcpu->arch.doorbell_request = 0;
+		break;
+	case OP_31_XOP_MFSPR:
+		switch (get_sprn(inst)) {
+		case SPRN_TIR:
+			arg = thr;
+			break;
+		case SPRN_DPDES:
+			arg = kvmppc_read_dpdes(vcpu);
+			break;
+		default:
+			return EMULATE_FAIL;
+		}
+		kvmppc_set_gpr(vcpu, get_rt(inst), arg);
+		break;
+	default:
+		return EMULATE_FAIL;
+	}
+	kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+	return RESUME_GUEST;
+}
+
+/*
+ * If the lppaca had pmcregs_in_use clear when we exited the guest, then
+ * HFSCR_PM is cleared for next entry. If the guest then tries to access
+ * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
+ * back in the guest HFSCR will cause the next entry to load the PMU SPRs and
+ * allow the guest access to continue.
+ */
+static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
+		return EMULATE_FAIL;
+
+	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PM);
+
+	return RESUME_GUEST;
+}
+
+static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
+		return EMULATE_FAIL;
+
+	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_EBB);
+
+	return RESUME_GUEST;
+}
+
+static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
+		return EMULATE_FAIL;
+
+	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
+
+	return RESUME_GUEST;
+}
+
+static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
+				 struct task_struct *tsk)
+{
+	struct kvm_run *run = vcpu->run;
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	/*
+	 * This can happen if an interrupt occurs in the last stages
+	 * of guest entry or the first stages of guest exit (i.e. after
+	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
+	 * That can happen due to a bug, or due to a machine check
+	 * occurring at just the wrong time.
+	 */
+	if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
+		printk(KERN_EMERG "KVM trap in HV mode!\n");
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		kvmppc_dump_regs(vcpu);
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		run->hw.hardware_exit_reason = vcpu->arch.trap;
+		return RESUME_HOST;
+	}
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+		WARN_ON_ONCE(1); /* Should never happen */
+		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+		fallthrough;
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+	case BOOK3S_INTERRUPT_H_VIRT:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+	case BOOK3S_INTERRUPT_HMI:
+	case BOOK3S_INTERRUPT_PERFMON:
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK: {
+		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+		/*
+		 * Print the MCE event to host console. Ratelimit so the guest
+		 * can't flood the host log.
+		 */
+		if (__ratelimit(&rs))
+			machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
+
+		/*
+		 * If the guest can do FWNMI, exit to userspace so it can
+		 * deliver a FWNMI to the guest.
+		 * Otherwise we synthesize a machine check for the guest
+		 * so that it knows that the machine check occurred.
+		 */
+		if (!vcpu->kvm->arch.fwnmi_enabled) {
+			ulong flags = (__kvmppc_get_msr_hv(vcpu) & 0x083c0000) |
+					(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+			kvmppc_core_queue_machine_check(vcpu, flags);
+			r = RESUME_GUEST;
+			break;
+		}
+
+		/* Exit to guest with KVM_EXIT_NMI as exit reason */
+		run->exit_reason = KVM_EXIT_NMI;
+		run->hw.hardware_exit_reason = vcpu->arch.trap;
+		/* Clear out the old NMI status from run->flags */
+		run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
+		/* Now set the NMI status */
+		if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+			run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+		else
+			run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
+
+		r = RESUME_HOST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = (__kvmppc_get_msr_hv(vcpu) & 0x1f0000ull) |
+			(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		int i;
+
+		if (unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
+			/*
+			 * Guest userspace executed sc 1. This can only be
+			 * reached by the P9 path because the old path
+			 * handles this case in realmode hcall handlers.
+			 */
+			if (!kvmhv_vcpu_is_radix(vcpu)) {
+				/*
+				 * A guest could be running PR KVM, so this
+				 * may be a PR KVM hcall. It must be reflected
+				 * to the guest kernel as a sc interrupt.
+				 */
+				kvmppc_core_queue_syscall(vcpu);
+			} else {
+				/*
+				 * Radix guests can not run PR KVM or nested HV
+				 * hash guests which might run PR KVM, so this
+				 * is always a privilege fault. Send a program
+				 * check to guest kernel.
+				 */
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+			}
+			r = RESUME_GUEST;
+			break;
+		}
+
+		/*
+		 * hcall - gather args and set exit_reason. This will next be
+		 * handled by kvmppc_pseries_do_hcall which may be able to deal
+		 * with it and resume guest, or may punt to userspace.
+		 */
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresonding
+	 * host page has been paged out.
+	 *
+	 * Any other HDSI/HISI interrupts have been handled already for P7/8
+	 * guests. For POWER9 hash guests not using rmhandlers, basic hash
+	 * fault handling is done here.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
+		unsigned long vsid;
+		long err;
+
+		if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+		    unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
+			r = RESUME_GUEST; /* Just retry if it's the canary */
+			break;
+		}
+
+		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+			/*
+			 * Radix doesn't require anything, and pre-ISAv3.0 hash
+			 * already attempted to handle this in rmhandlers. The
+			 * hash fault handling below is v3 only (it uses ASDR
+			 * via fault_gpa).
+			 */
+			r = RESUME_PAGE_FAULT;
+			break;
+		}
+
+		if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
+			kvmppc_core_queue_data_storage(vcpu,
+				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+			r = RESUME_GUEST;
+			break;
+		}
+
+		if (!(__kvmppc_get_msr_hv(vcpu) & MSR_DR))
+			vsid = vcpu->kvm->arch.vrma_slb_v;
+		else
+			vsid = vcpu->arch.fault_gpa;
+
+		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+				vsid, vcpu->arch.fault_dsisr, true);
+		if (err == 0) {
+			r = RESUME_GUEST;
+		} else if (err == -1 || err == -2) {
+			r = RESUME_PAGE_FAULT;
+		} else {
+			kvmppc_core_queue_data_storage(vcpu,
+				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+				vcpu->arch.fault_dar, err);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_H_INST_STORAGE: {
+		unsigned long vsid;
+		long err;
+
+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+		vcpu->arch.fault_dsisr = __kvmppc_get_msr_hv(vcpu) &
+			DSISR_SRR1_MATCH_64S;
+		if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+			/*
+			 * Radix doesn't require anything, and pre-ISAv3.0 hash
+			 * already attempted to handle this in rmhandlers. The
+			 * hash fault handling below is v3 only (it uses ASDR
+			 * via fault_gpa).
+			 */
+			if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
+				vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+			r = RESUME_PAGE_FAULT;
+			break;
+		}
+
+		if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
+			kvmppc_core_queue_inst_storage(vcpu,
+				vcpu->arch.fault_dsisr |
+				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
+			r = RESUME_GUEST;
+			break;
+		}
+
+		if (!(__kvmppc_get_msr_hv(vcpu) & MSR_IR))
+			vsid = vcpu->kvm->arch.vrma_slb_v;
+		else
+			vsid = vcpu->arch.fault_gpa;
+
+		err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+				vsid, vcpu->arch.fault_dsisr, false);
+		if (err == 0) {
+			r = RESUME_GUEST;
+		} else if (err == -1) {
+			r = RESUME_PAGE_FAULT;
+		} else {
+			kvmppc_core_queue_inst_storage(vcpu,
+				err | (kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * If the guest debug is disabled, generate a program interrupt
+	 * to the guest. If guest debug is enabled, we need to check
+	 * whether the instruction is a software breakpoint instruction.
+	 * Accordingly return to Guest or Host.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
+			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
+				swab32(vcpu->arch.emul_inst) :
+				vcpu->arch.emul_inst;
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
+			r = kvmppc_emulate_debug_inst(vcpu);
+		} else {
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
+				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
+			r = RESUME_GUEST;
+		}
+		break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+		/*
+		 * This occurs for various TM-related instructions that
+		 * we need to emulate on POWER9 DD2.2.  We have already
+		 * handled the cases where the guest was in real-suspend
+		 * mode and was transitioning to transactional state.
+		 */
+		r = kvmhv_p9_tm_emulation(vcpu);
+		if (r != -1)
+			break;
+		fallthrough; /* go to facility unavailable handler */
+#endif
+
+	/*
+	 * This occurs if the guest (kernel or userspace), does something that
+	 * is prohibited by HFSCR.
+	 * On POWER9, this could be a doorbell instruction that we need
+	 * to emulate.
+	 * Otherwise, we just generate a program interrupt to the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
+		u64 cause = kvmppc_get_hfscr_hv(vcpu) >> 56;
+
+		r = EMULATE_FAIL;
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			if (cause == FSCR_MSGP_LG)
+				r = kvmppc_emulate_doorbell_instr(vcpu);
+			if (cause == FSCR_PM_LG)
+				r = kvmppc_pmu_unavailable(vcpu);
+			if (cause == FSCR_EBB_LG)
+				r = kvmppc_ebb_unavailable(vcpu);
+			if (cause == FSCR_TM_LG)
+				r = kvmppc_tm_unavailable(vcpu);
+		}
+		if (r == EMULATE_FAIL) {
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
+				(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+
+	case BOOK3S_INTERRUPT_HV_RM_HARD:
+		r = RESUME_PASSTHROUGH;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			__kvmppc_get_msr_hv(vcpu));
+		run->hw.hardware_exit_reason = vcpu->arch.trap;
+		r = RESUME_HOST;
+		break;
+	}
+
+	return r;
+}
+
+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+{
+	int r;
+	int srcu_idx;
+
+	vcpu->stat.sum_exits++;
+
+	/*
+	 * This can happen if an interrupt occurs in the last stages
+	 * of guest entry or the first stages of guest exit (i.e. after
+	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
+	 * That can happen due to a bug, or due to a machine check
+	 * occurring at just the wrong time.
+	 */
+	if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
+		pr_emerg("KVM trap in HV mode while nested!\n");
+		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			 __kvmppc_get_msr_hv(vcpu));
+		kvmppc_dump_regs(vcpu);
+		return RESUME_HOST;
+	}
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_HOST;
+		break;
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+	case BOOK3S_INTERRUPT_H_VIRT:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	/* These need to go to the nested HV */
+	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+		vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+		vcpu->stat.dec_exits++;
+		r = RESUME_HOST;
+		break;
+	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+	case BOOK3S_INTERRUPT_HMI:
+	case BOOK3S_INTERRUPT_PERFMON:
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+	{
+		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+		/* Pass the machine check to the L1 guest */
+		r = RESUME_HOST;
+		/* Print the MCE event to host console. */
+		if (__ratelimit(&rs))
+			machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
+		break;
+	}
+	/*
+	 * We get these next two if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresonding
+	 * host page has been paged out.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = kvmhv_nested_page_fault(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+					 DSISR_SRR1_MATCH_64S;
+		if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
+			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = kvmhv_nested_page_fault(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+		/*
+		 * This occurs for various TM-related instructions that
+		 * we need to emulate on POWER9 DD2.2.  We have already
+		 * handled the cases where the guest was in real-suspend
+		 * mode and was transitioning to transactional state.
+		 */
+		r = kvmhv_p9_tm_emulation(vcpu);
+		if (r != -1)
+			break;
+		fallthrough; /* go to facility unavailable handler */
+#endif
+
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
+		u64 cause = vcpu->arch.hfscr >> 56;
+
+		/*
+		 * Only pass HFU interrupts to the L1 if the facility is
+		 * permitted but disabled by the L1's HFSCR, otherwise
+		 * the interrupt does not make sense to the L1 so turn
+		 * it into a HEAI.
+		 */
+		if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
+				(vcpu->arch.nested_hfscr & (1UL << cause))) {
+			ppc_inst_t pinst;
+			vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
+
+			/*
+			 * If the fetch failed, return to guest and
+			 * try executing it again.
+			 */
+			r = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
+			vcpu->arch.emul_inst = ppc_inst_val(pinst);
+			if (r != EMULATE_DONE)
+				r = RESUME_GUEST;
+			else
+				r = RESUME_HOST;
+		} else {
+			r = RESUME_HOST;
+		}
+
+		break;
+	}
+
+	case BOOK3S_INTERRUPT_HV_RM_HARD:
+		vcpu->arch.trap = 0;
+		r = RESUME_GUEST;
+		if (!xics_on_xive())
+			kvmppc_xics_rm_complete(vcpu, 0);
+		break;
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+		/*
+		 * The H_RPT_INVALIDATE hcalls issued by nested
+		 * guests for process-scoped invalidations when
+		 * GTSE=0, are handled here in L0.
+		 */
+		if (req == H_RPT_INVALIDATE) {
+			r = kvmppc_nested_h_rpt_invalidate(vcpu);
+			break;
+		}
+
+		r = RESUME_HOST;
+		break;
+	}
+	default:
+		r = RESUME_HOST;
+		break;
+	}
+
+	return r;
+}
+
+static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	int i;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	sregs->pvr = vcpu->arch.pvr;
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	/* Only accept the same PVR as the host's, since we can't spoof it */
+	if (sregs->pvr != vcpu->arch.pvr)
+		return -EINVAL;
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+/*
+ * Enforce limits on guest LPCR values based on hardware availability,
+ * guest configuration, and possibly hypervisor support and security
+ * concerns.
+ */
+unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
+{
+	/* LPCR_TC only applies to HPT guests */
+	if (kvm_is_radix(kvm))
+		lpcr &= ~LPCR_TC;
+
+	/* On POWER8 and above, userspace can modify AIL */
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		lpcr &= ~LPCR_AIL;
+	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
+		lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
+	/*
+	 * On some POWER9s we force AIL off for radix guests to prevent
+	 * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
+	 * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
+	 * be cached, which the host TLB management does not expect.
+	 */
+	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+		lpcr &= ~LPCR_AIL;
+
+	/*
+	 * On POWER9, allow userspace to enable large decrementer for the
+	 * guest, whether or not the host has it enabled.
+	 */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		lpcr &= ~LPCR_LD;
+
+	return lpcr;
+}
+
+static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
+{
+	if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) {
+		WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n",
+			  lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr));
+	}
+}
+
+static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
+		bool preserve_top32)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	u64 mask;
+
+	spin_lock(&vc->lock);
+
+	/*
+	 * Userspace can only modify
+	 * DPFD (default prefetch depth), ILE (interrupt little-endian),
+	 * TC (translation control), AIL (alternate interrupt location),
+	 * LD (large decrementer).
+	 * These are subject to restrictions from kvmppc_filter_lcpr_hv().
+	 */
+	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD;
+
+	/* Broken 32-bit version of LPCR must not clear top bits */
+	if (preserve_top32)
+		mask &= 0xFFFFFFFF;
+
+	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
+			(vc->lpcr & ~mask) | (new_lpcr & mask));
+
+	/*
+	 * If ILE (interrupt little-endian) has changed, update the
+	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
+	 */
+	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
+		struct kvm_vcpu *vcpu;
+		unsigned long i;
+
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (vcpu->arch.vcore != vc)
+				continue;
+			if (new_lpcr & LPCR_ILE)
+				vcpu->arch.intr_msr |= MSR_LE;
+			else
+				vcpu->arch.intr_msr &= ~MSR_LE;
+		}
+	}
+
+	vc->lpcr = new_lpcr;
+
+	spin_unlock(&vc->lock);
+}
+
+static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_HIOR:
+		*val = get_reg_val(id, 0);
+		break;
+	case KVM_REG_PPC_DABR:
+		*val = get_reg_val(id, vcpu->arch.dabr);
+		break;
+	case KVM_REG_PPC_DABRX:
+		*val = get_reg_val(id, vcpu->arch.dabrx);
+		break;
+	case KVM_REG_PPC_DSCR:
+		*val = get_reg_val(id, kvmppc_get_dscr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_PURR:
+		*val = get_reg_val(id, kvmppc_get_purr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_SPURR:
+		*val = get_reg_val(id, kvmppc_get_spurr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_AMR:
+		*val = get_reg_val(id, kvmppc_get_amr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_UAMOR:
+		*val = get_reg_val(id, kvmppc_get_uamor_hv(vcpu));
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
+		i = id - KVM_REG_PPC_MMCR0;
+		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i));
+		break;
+	case KVM_REG_PPC_MMCR2:
+		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 2));
+		break;
+	case KVM_REG_PPC_MMCRA:
+		*val = get_reg_val(id, kvmppc_get_mmcra_hv(vcpu));
+		break;
+	case KVM_REG_PPC_MMCRS:
+		*val = get_reg_val(id, vcpu->arch.mmcrs);
+		break;
+	case KVM_REG_PPC_MMCR3:
+		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 3));
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		*val = get_reg_val(id, kvmppc_get_pmc_hv(vcpu, i));
+		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		*val = get_reg_val(id, vcpu->arch.spmc[i]);
+		break;
+	case KVM_REG_PPC_SIAR:
+		*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
+		break;
+	case KVM_REG_PPC_SDAR:
+		*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
+		break;
+	case KVM_REG_PPC_SIER:
+		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 0));
+		break;
+	case KVM_REG_PPC_SIER2:
+		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 1));
+		break;
+	case KVM_REG_PPC_SIER3:
+		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 2));
+		break;
+	case KVM_REG_PPC_IAMR:
+		*val = get_reg_val(id, kvmppc_get_iamr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_PSPB:
+		*val = get_reg_val(id, kvmppc_get_pspb_hv(vcpu));
+		break;
+	case KVM_REG_PPC_DPDES:
+		/*
+		 * On POWER9, where we are emulating msgsndp etc.,
+		 * we return 1 bit for each vcpu, which can come from
+		 * either vcore->dpdes or doorbell_request.
+		 * On POWER8, doorbell_request is 0.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			*val = get_reg_val(id, vcpu->arch.doorbell_request);
+		else
+			*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+		break;
+	case KVM_REG_PPC_VTB:
+		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
+		break;
+	case KVM_REG_PPC_DAWR:
+		*val = get_reg_val(id, kvmppc_get_dawr0_hv(vcpu));
+		break;
+	case KVM_REG_PPC_DAWRX:
+		*val = get_reg_val(id, kvmppc_get_dawrx0_hv(vcpu));
+		break;
+	case KVM_REG_PPC_DAWR1:
+		*val = get_reg_val(id, kvmppc_get_dawr1_hv(vcpu));
+		break;
+	case KVM_REG_PPC_DAWRX1:
+		*val = get_reg_val(id, kvmppc_get_dawrx1_hv(vcpu));
+		break;
+	case KVM_REG_PPC_CIABR:
+		*val = get_reg_val(id, kvmppc_get_ciabr_hv(vcpu));
+		break;
+	case KVM_REG_PPC_CSIGR:
+		*val = get_reg_val(id, vcpu->arch.csigr);
+		break;
+	case KVM_REG_PPC_TACR:
+		*val = get_reg_val(id, vcpu->arch.tacr);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		*val = get_reg_val(id, vcpu->arch.tcscr);
+		break;
+	case KVM_REG_PPC_PID:
+		*val = get_reg_val(id, vcpu->arch.pid);
+		break;
+	case KVM_REG_PPC_ACOP:
+		*val = get_reg_val(id, vcpu->arch.acop);
+		break;
+	case KVM_REG_PPC_WORT:
+		*val = get_reg_val(id, kvmppc_get_wort_hv(vcpu));
+		break;
+	case KVM_REG_PPC_TIDR:
+		*val = get_reg_val(id, vcpu->arch.tid);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		*val = get_reg_val(id, vcpu->arch.psscr);
+		break;
+	case KVM_REG_PPC_VPA_ADDR:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
+		val->vpaval.length = vcpu->arch.slb_shadow.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
+		val->vpaval.length = vcpu->arch.dtl.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_TB_OFFSET:
+		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
+		break;
+	case KVM_REG_PPC_PPR:
+		*val = get_reg_val(id, kvmppc_get_ppr_hv(vcpu));
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		*val = get_reg_val(id, vcpu->arch.tfhar);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		*val = get_reg_val(id, vcpu->arch.tfiar);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		*val = get_reg_val(id, vcpu->arch.texasr);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		i = id - KVM_REG_PPC_TM_GPR0;
+		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int j;
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+		else {
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				val->vval = vcpu->arch.vr_tm.vr[i-32];
+			else
+				r = -ENXIO;
+		}
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		*val = get_reg_val(id, vcpu->arch.cr_tm);
+		break;
+	case KVM_REG_PPC_TM_XER:
+		*val = get_reg_val(id, vcpu->arch.xer_tm);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		*val = get_reg_val(id, vcpu->arch.lr_tm);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		*val = get_reg_val(id, vcpu->arch.ctr_tm);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		*val = get_reg_val(id, vcpu->arch.amr_tm);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		*val = get_reg_val(id, vcpu->arch.ppr_tm);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+		else
+			r = -ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		*val = get_reg_val(id, vcpu->arch.dscr_tm);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		*val = get_reg_val(id, vcpu->arch.tar_tm);
+		break;
+#endif
+	case KVM_REG_PPC_ARCH_COMPAT:
+		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
+		break;
+	case KVM_REG_PPC_DEC_EXPIRY:
+		*val = get_reg_val(id, vcpu->arch.dec_expires);
+		break;
+	case KVM_REG_PPC_ONLINE:
+		*val = get_reg_val(id, vcpu->arch.online);
+		break;
+	case KVM_REG_PPC_PTCR:
+		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
+		break;
+	case KVM_REG_PPC_FSCR:
+		*val = get_reg_val(id, kvmppc_get_fscr_hv(vcpu));
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+	unsigned long addr, len;
+
+	switch (id) {
+	case KVM_REG_PPC_HIOR:
+		/* Only allow this to be set to zero */
+		if (set_reg_val(id, *val))
+			r = -EINVAL;
+		break;
+	case KVM_REG_PPC_DABR:
+		vcpu->arch.dabr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DABRX:
+		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
+		break;
+	case KVM_REG_PPC_DSCR:
+		kvmppc_set_dscr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_PURR:
+		kvmppc_set_purr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SPURR:
+		kvmppc_set_spurr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_AMR:
+		kvmppc_set_amr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_UAMOR:
+		kvmppc_set_uamor_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
+		i = id - KVM_REG_PPC_MMCR0;
+		kvmppc_set_mmcr_hv(vcpu, i, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_MMCR2:
+		kvmppc_set_mmcr_hv(vcpu, 2, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_MMCRA:
+		kvmppc_set_mmcra_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_MMCRS:
+		vcpu->arch.mmcrs = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MMCR3:
+		*val = get_reg_val(id, vcpu->arch.mmcr[3]);
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		kvmppc_set_pmc_hv(vcpu, i, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		vcpu->arch.spmc[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SIAR:
+		kvmppc_set_siar_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SDAR:
+		kvmppc_set_sdar_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SIER:
+		kvmppc_set_sier_hv(vcpu, 0, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SIER2:
+		kvmppc_set_sier_hv(vcpu, 1, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_SIER3:
+		kvmppc_set_sier_hv(vcpu, 2, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_IAMR:
+		kvmppc_set_iamr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_PSPB:
+		kvmppc_set_pspb_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_DPDES:
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
+		else
+			vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_VTB:
+		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAWR:
+		kvmppc_set_dawr0_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_DAWRX:
+		kvmppc_set_dawrx0_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
+		break;
+	case KVM_REG_PPC_DAWR1:
+		kvmppc_set_dawr1_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_DAWRX1:
+		kvmppc_set_dawrx1_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
+		break;
+	case KVM_REG_PPC_CIABR:
+		kvmppc_set_ciabr_hv(vcpu, set_reg_val(id, *val));
+		/* Don't allow setting breakpoints in hypervisor code */
+		if ((kvmppc_get_ciabr_hv(vcpu) & CIABR_PRIV) == CIABR_PRIV_HYPER)
+			kvmppc_set_ciabr_hv(vcpu, kvmppc_get_ciabr_hv(vcpu) & ~CIABR_PRIV);
+		break;
+	case KVM_REG_PPC_CSIGR:
+		vcpu->arch.csigr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TACR:
+		vcpu->arch.tacr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		vcpu->arch.tcscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PID:
+		vcpu->arch.pid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_ACOP:
+		vcpu->arch.acop = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_WORT:
+		kvmppc_set_wort_hv(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_TIDR:
+		vcpu->arch.tid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
+		break;
+	case KVM_REG_PPC_VPA_ADDR:
+		addr = set_reg_val(id, *val);
+		r = -EINVAL;
+		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
+			      vcpu->arch.dtl.next_gpa))
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && !vcpu->arch.vpa.next_gpa)
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && (len < sizeof(struct dtl_entry) ||
+			     !vcpu->arch.vpa.next_gpa))
+			break;
+		len -= len % sizeof(struct dtl_entry);
+		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
+		break;
+	case KVM_REG_PPC_TB_OFFSET:
+	{
+		/* round up to multiple of 2^24 */
+		u64 tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24);
+
+		/*
+		 * Now that we know the timebase offset, update the
+		 * decrementer expiry with a guest timebase value. If
+		 * the userspace does not set DEC_EXPIRY, this ensures
+		 * a migrated vcpu at least starts with an expired
+		 * decrementer, which is better than a large one that
+		 * causes a hang.
+		 */
+		if (!vcpu->arch.dec_expires && tb_offset)
+			vcpu->arch.dec_expires = get_tb() + tb_offset;
+
+		vcpu->arch.vcore->tb_offset = tb_offset;
+		break;
+	}
+	case KVM_REG_PPC_LPCR:
+		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
+		break;
+	case KVM_REG_PPC_LPCR_64:
+		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
+		break;
+	case KVM_REG_PPC_PPR:
+		kvmppc_set_ppr_hv(vcpu, set_reg_val(id, *val));
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		vcpu->arch.tfhar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		vcpu->arch.tfiar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		vcpu->arch.texasr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		i = id - KVM_REG_PPC_TM_GPR0;
+		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int j;
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+		else
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				vcpu->arch.vr_tm.vr[i-32] = val->vval;
+			else
+				r = -ENXIO;
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		vcpu->arch.cr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_XER:
+		vcpu->arch.xer_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		vcpu->arch.lr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		vcpu->arch.ctr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		vcpu->arch.amr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		vcpu->arch.ppr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+		else
+			r = - ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		vcpu->arch.dscr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		vcpu->arch.tar_tm = set_reg_val(id, *val);
+		break;
+#endif
+	case KVM_REG_PPC_ARCH_COMPAT:
+		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
+		break;
+	case KVM_REG_PPC_DEC_EXPIRY:
+		vcpu->arch.dec_expires = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_ONLINE:
+		i = set_reg_val(id, *val);
+		if (i && !vcpu->arch.online)
+			atomic_inc(&vcpu->arch.vcore->online_count);
+		else if (!i && vcpu->arch.online)
+			atomic_dec(&vcpu->arch.vcore->online_count);
+		vcpu->arch.online = i;
+		break;
+	case KVM_REG_PPC_PTCR:
+		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_FSCR:
+		kvmppc_set_fscr_hv(vcpu, set_reg_val(id, *val));
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+/*
+ * On POWER9, threads are independent and can be in different partitions.
+ * Therefore we consider each thread to be a subcore.
+ * There is a restriction that all threads have to be in the same
+ * MMU mode (radix or HPT), unfortunately, but since we only support
+ * HPT guests on a HPT host so far, that isn't an impediment yet.
+ */
+static int threads_per_vcore(struct kvm *kvm)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return 1;
+	return threads_per_subcore;
+}
+
+static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
+{
+	struct kvmppc_vcore *vcore;
+
+	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+
+	if (vcore == NULL)
+		return NULL;
+
+	spin_lock_init(&vcore->lock);
+	spin_lock_init(&vcore->stoltb_lock);
+	rcuwait_init(&vcore->wait);
+	vcore->preempt_tb = TB_NIL;
+	vcore->lpcr = kvm->arch.lpcr;
+	vcore->first_vcpuid = id;
+	vcore->kvm = kvm;
+	INIT_LIST_HEAD(&vcore->preempt_list);
+
+	return vcore;
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+	const char *name;
+	size_t offset;
+} timings[] = {
+#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
+	{"vcpu_entry",	offsetof(struct kvm_vcpu, arch.vcpu_entry)},
+	{"guest_entry",	offsetof(struct kvm_vcpu, arch.guest_entry)},
+	{"in_guest",	offsetof(struct kvm_vcpu, arch.in_guest)},
+	{"guest_exit",	offsetof(struct kvm_vcpu, arch.guest_exit)},
+	{"vcpu_exit",	offsetof(struct kvm_vcpu, arch.vcpu_exit)},
+	{"hypercall",	offsetof(struct kvm_vcpu, arch.hcall)},
+	{"page_fault",	offsetof(struct kvm_vcpu, arch.pg_fault)},
+#else
+	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
+	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
+	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
+	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
+	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
+#endif
+};
+
+#define N_TIMINGS	(ARRAY_SIZE(timings))
+
+struct debugfs_timings_state {
+	struct kvm_vcpu	*vcpu;
+	unsigned int	buflen;
+	char		buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = inode->i_private;
+	struct debugfs_timings_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(vcpu->kvm);
+	p->vcpu = vcpu;
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_timings_state *p = file->private_data;
+
+	kvm_put_kvm(p->vcpu->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos)
+{
+	struct debugfs_timings_state *p = file->private_data;
+	struct kvm_vcpu *vcpu = p->vcpu;
+	char *s, *buf_end;
+	struct kvmhv_tb_accumulator tb;
+	u64 count;
+	loff_t pos;
+	ssize_t n;
+	int i, loops;
+	bool ok;
+
+	if (!p->buflen) {
+		s = p->buf;
+		buf_end = s + sizeof(p->buf);
+		for (i = 0; i < N_TIMINGS; ++i) {
+			struct kvmhv_tb_accumulator *acc;
+
+			acc = (struct kvmhv_tb_accumulator *)
+				((unsigned long)vcpu + timings[i].offset);
+			ok = false;
+			for (loops = 0; loops < 1000; ++loops) {
+				count = acc->seqcount;
+				if (!(count & 1)) {
+					smp_rmb();
+					tb = *acc;
+					smp_rmb();
+					if (count == acc->seqcount) {
+						ok = true;
+						break;
+					}
+				}
+				udelay(1);
+			}
+			if (!ok)
+				snprintf(s, buf_end - s, "%s: stuck\n",
+					timings[i].name);
+			else
+				snprintf(s, buf_end - s,
+					"%s: %llu %llu %llu %llu\n",
+					timings[i].name, count / 2,
+					tb_to_ns(tb.tb_total),
+					tb_to_ns(tb.tb_min),
+					tb_to_ns(tb.tb_max));
+			s += strlen(s);
+		}
+		p->buflen = s - p->buf;
+	}
+
+	pos = *ppos;
+	if (pos >= p->buflen)
+		return 0;
+	if (len > p->buflen - pos)
+		len = p->buflen - pos;
+	n = copy_to_user(buf, p->buf + pos, len);
+	if (n) {
+		if (n == len)
+			return -EFAULT;
+		len -= n;
+	}
+	*ppos = pos + len;
+	return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+				     size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_timings_open,
+	.release = debugfs_timings_release,
+	.read	 = debugfs_timings_read,
+	.write	 = debugfs_timings_write,
+	.llseek	 = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300) == IS_ENABLED(CONFIG_KVM_BOOK3S_HV_P9_TIMING))
+		debugfs_create_file("timings", 0444, debugfs_dentry, vcpu,
+				    &debugfs_timings_ops);
+	return 0;
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+	return 0;
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
+static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
+{
+	int err;
+	int core;
+	struct kvmppc_vcore *vcore;
+	struct kvm *kvm;
+	unsigned int id;
+
+	kvm = vcpu->kvm;
+	id = vcpu->vcpu_id;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	/*
+	 * The shared struct is never shared on HV,
+	 * so we can always use host endianness
+	 */
+#ifdef __BIG_ENDIAN__
+	vcpu->arch.shared_big_endian = true;
+#else
+	vcpu->arch.shared_big_endian = false;
+#endif
+#endif
+	kvmppc_set_mmcr_hv(vcpu, 0, MMCR0_FC);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		kvmppc_set_mmcr_hv(vcpu, 0, kvmppc_get_mmcr_hv(vcpu, 0) | MMCR0_PMCCEXT);
+		kvmppc_set_mmcra_hv(vcpu, MMCRA_BHRB_DISABLE);
+	}
+
+	kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH);
+	/* default to host PVR, since we can't spoof it */
+	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
+	spin_lock_init(&vcpu->arch.vpa_update_lock);
+	spin_lock_init(&vcpu->arch.tbacct_lock);
+	vcpu->arch.busy_preempt = TB_NIL;
+	__kvmppc_set_msr_hv(vcpu, MSR_ME);
+	vcpu->arch.intr_msr = MSR_SF | MSR_ME;
+
+	/*
+	 * Set the default HFSCR for the guest from the host value.
+	 * This value is only used on POWER9 and later.
+	 * On >= POWER9, we want to virtualize the doorbell facility, so we
+	 * don't set the HFSCR_MSGP bit, and that causes those instructions
+	 * to trap and then we emulate them.
+	 */
+	kvmppc_set_hfscr_hv(vcpu, HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
+			    HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP);
+
+	/* On POWER10 and later, allow prefixed instructions */
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PREFIX);
+
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & mfspr(SPRN_HFSCR));
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+		if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+			kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM);
+#endif
+	}
+	if (cpu_has_feature(CPU_FTR_TM_COMP))
+		vcpu->arch.hfscr |= HFSCR_TM;
+
+	vcpu->arch.hfscr_permitted = kvmppc_get_hfscr_hv(vcpu);
+
+	/*
+	 * PM, EBB, TM are demand-faulted so start with it clear.
+	 */
+	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM));
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+
+	mutex_lock(&kvm->lock);
+	vcore = NULL;
+	err = -EINVAL;
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
+			pr_devel("KVM: VCPU ID too high\n");
+			core = KVM_MAX_VCORES;
+		} else {
+			BUG_ON(kvm->arch.smt_mode != 1);
+			core = kvmppc_pack_vcpu_id(kvm, id);
+		}
+	} else {
+		core = id / kvm->arch.smt_mode;
+	}
+	if (core < KVM_MAX_VCORES) {
+		vcore = kvm->arch.vcores[core];
+		if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
+			pr_devel("KVM: collision on id %u", id);
+			vcore = NULL;
+		} else if (!vcore) {
+			/*
+			 * Take mmu_setup_lock for mutual exclusion
+			 * with kvmppc_update_lpcr().
+			 */
+			err = -ENOMEM;
+			vcore = kvmppc_vcore_create(kvm,
+					id & ~(kvm->arch.smt_mode - 1));
+			mutex_lock(&kvm->arch.mmu_setup_lock);
+			kvm->arch.vcores[core] = vcore;
+			kvm->arch.online_vcores++;
+			mutex_unlock(&kvm->arch.mmu_setup_lock);
+		}
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		return err;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.prev_cpu = -1;
+
+	vcpu->arch.cpu_type = KVM_CPU_3S_64;
+	kvmppc_sanity_check(vcpu);
+
+	return 0;
+}
+
+static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
+			      unsigned long flags)
+{
+	int err;
+	int esmt = 0;
+
+	if (flags)
+		return -EINVAL;
+	if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
+		return -EINVAL;
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/*
+		 * On POWER8 (or POWER7), the threading mode is "strict",
+		 * so we pack smt_mode vcpus per vcore.
+		 */
+		if (smt_mode > threads_per_subcore)
+			return -EINVAL;
+	} else {
+		/*
+		 * On POWER9, the threading mode is "loose",
+		 * so each vcpu gets its own vcore.
+		 */
+		esmt = smt_mode;
+		smt_mode = 1;
+	}
+	mutex_lock(&kvm->lock);
+	err = -EBUSY;
+	if (!kvm->arch.online_vcores) {
+		kvm->arch.smt_mode = smt_mode;
+		kvm->arch.emul_smt_mode = esmt;
+		err = 0;
+	}
+	mutex_unlock(&kvm->lock);
+
+	return err;
+}
+
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+	if (vpa->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+					vpa->dirty);
+}
+
+static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
+static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
+{
+	/* Indicate we want to get back into the guest */
+	return 1;
+}
+
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
+{
+	unsigned long dec_nsec, now;
+
+	now = get_tb();
+	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
+		/* decrementer has already gone negative */
+		kvmppc_core_queue_dec(vcpu);
+		kvmppc_core_prepare_to_enter(vcpu);
+		return;
+	}
+	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
+	hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
+	vcpu->arch.timer_running = 1;
+}
+
+extern int __kvmppc_vcore_entry(void);
+
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+				   struct kvm_vcpu *vcpu, u64 tb)
+{
+	u64 now;
+
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	spin_lock_irq(&vcpu->arch.tbacct_lock);
+	now = tb;
+	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
+		vcpu->arch.stolen_logged;
+	vcpu->arch.busy_preempt = now;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	spin_unlock_irq(&vcpu->arch.tbacct_lock);
+	--vc->n_runnable;
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
+}
+
+static int kvmppc_grab_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+	long timeout = 10000;
+
+	tpaca = paca_ptrs[cpu];
+
+	/* Ensure the thread won't go into the kernel if it wakes */
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.kvm_vcore = NULL;
+	tpaca->kvm_hstate.napping = 0;
+	smp_wmb();
+	tpaca->kvm_hstate.hwthread_req = 1;
+
+	/*
+	 * If the thread is already executing in the kernel (e.g. handling
+	 * a stray interrupt), wait for it to get back to nap mode.
+	 * The smp_mb() is to ensure that our setting of hwthread_req
+	 * is visible before we look at hwthread_state, so if this
+	 * races with the code at system_reset_pSeries and the thread
+	 * misses our setting of hwthread_req, we are sure to see its
+	 * setting of hwthread_state, and vice versa.
+	 */
+	smp_mb();
+	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
+		if (--timeout <= 0) {
+			pr_err("KVM: couldn't grab cpu %d\n", cpu);
+			return -EBUSY;
+		}
+		udelay(1);
+	}
+	return 0;
+}
+
+static void kvmppc_release_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+
+	tpaca = paca_ptrs[cpu];
+	tpaca->kvm_hstate.hwthread_req = 0;
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.kvm_vcore = NULL;
+	tpaca->kvm_hstate.kvm_split_mode = NULL;
+}
+
+static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
+
+static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
+{
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	cpumask_t *need_tlb_flush;
+	int i;
+
+	if (nested)
+		need_tlb_flush = &nested->need_tlb_flush;
+	else
+		need_tlb_flush = &kvm->arch.need_tlb_flush;
+
+	cpu = cpu_first_tlb_thread_sibling(cpu);
+	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+					i += cpu_tlb_thread_sibling_step())
+		cpumask_set_cpu(i, need_tlb_flush);
+
+	/*
+	 * Make sure setting of bit in need_tlb_flush precedes testing of
+	 * cpu_in_guest. The matching barrier on the other side is hwsync
+	 * when switching to guest MMU mode, which happens between
+	 * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
+	 * being tested.
+	 */
+	smp_mb();
+
+	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+					i += cpu_tlb_thread_sibling_step()) {
+		struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
+
+		if (running == kvm)
+			smp_call_function_single(i, do_nothing, NULL, 1);
+	}
+}
+
+static void do_migrate_away_vcpu(void *arg)
+{
+	struct kvm_vcpu *vcpu = arg;
+	struct kvm *kvm = vcpu->kvm;
+
+	/*
+	 * If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
+	 * ptesync sequence on the old CPU before migrating to a new one, in
+	 * case we interrupted the guest between a tlbie ; eieio ;
+	 * tlbsync; ptesync sequence.
+	 *
+	 * Otherwise, ptesync is sufficient for ordering tlbiel sequences.
+	 */
+	if (kvm->arch.lpcr & LPCR_GTSE)
+		asm volatile("eieio; tlbsync; ptesync");
+	else
+		asm volatile("ptesync");
+}
+
+static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
+{
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	struct kvm *kvm = vcpu->kvm;
+	int prev_cpu;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	if (nested)
+		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+	else
+		prev_cpu = vcpu->arch.prev_cpu;
+
+	/*
+	 * With radix, the guest can do TLB invalidations itself,
+	 * and it could choose to use the local form (tlbiel) if
+	 * it is invalidating a translation that has only ever been
+	 * used on one vcpu.  However, that doesn't mean it has
+	 * only ever been used on one physical cpu, since vcpus
+	 * can move around between pcpus.  To cope with this, when
+	 * a vcpu moves from one pcpu to another, we need to tell
+	 * any vcpus running on the same core as this vcpu previously
+	 * ran to flush the TLB.
+	 */
+	if (prev_cpu != pcpu) {
+		if (prev_cpu >= 0) {
+			if (cpu_first_tlb_thread_sibling(prev_cpu) !=
+			    cpu_first_tlb_thread_sibling(pcpu))
+				radix_flush_cpu(kvm, prev_cpu, vcpu);
+
+			smp_call_function_single(prev_cpu,
+					do_migrate_away_vcpu, vcpu, 1);
+		}
+		if (nested)
+			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+		else
+			vcpu->arch.prev_cpu = pcpu;
+	}
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+
+	cpu = vc->pcpu;
+	if (vcpu) {
+		if (vcpu->arch.timer_running) {
+			hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+			vcpu->arch.timer_running = 0;
+		}
+		cpu += vcpu->arch.ptid;
+		vcpu->cpu = vc->pcpu;
+		vcpu->arch.thread_cpu = cpu;
+	}
+	tpaca = paca_ptrs[cpu];
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
+	tpaca->kvm_hstate.fake_suspend = 0;
+	/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
+	smp_wmb();
+	tpaca->kvm_hstate.kvm_vcore = vc;
+	if (cpu != smp_processor_id())
+		kvmppc_ipi_thread(cpu);
+}
+
+static void kvmppc_wait_for_nap(int n_threads)
+{
+	int cpu = smp_processor_id();
+	int i, loops;
+
+	if (n_threads <= 1)
+		return;
+	for (loops = 0; loops < 1000000; ++loops) {
+		/*
+		 * Check if all threads are finished.
+		 * We set the vcore pointer when starting a thread
+		 * and the thread clears it when finished, so we look
+		 * for any threads that still have a non-NULL vcore ptr.
+		 */
+		for (i = 1; i < n_threads; ++i)
+			if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
+				break;
+		if (i == n_threads) {
+			HMT_medium();
+			return;
+		}
+		HMT_low();
+	}
+	HMT_medium();
+	for (i = 1; i < n_threads; ++i)
+		if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
+			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.  Then grab the threads so they can't
+ * enter the kernel.
+ */
+static int on_primary_thread(void)
+{
+	int cpu = smp_processor_id();
+	int thr;
+
+	/* Are we on a primary subcore? */
+	if (cpu_thread_in_subcore(cpu))
+		return 0;
+
+	thr = 0;
+	while (++thr < threads_per_subcore)
+		if (cpu_online(cpu + thr))
+			return 0;
+
+	/* Grab all hw threads so they can't go into the kernel */
+	for (thr = 1; thr < threads_per_subcore; ++thr) {
+		if (kvmppc_grab_hwthread(cpu + thr)) {
+			/* Couldn't grab one; let the others go */
+			do {
+				kvmppc_release_hwthread(cpu + thr);
+			} while (--thr > 0);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * A list of virtual cores for each physical CPU.
+ * These are vcores that could run but their runner VCPU tasks are
+ * (or may be) preempted.
+ */
+struct preempted_vcore_list {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
+
+static void init_vcore_lists(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
+		spin_lock_init(&lp->lock);
+		INIT_LIST_HEAD(&lp->list);
+	}
+}
+
+static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
+{
+	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	vc->vcore_state = VCORE_PREEMPT;
+	vc->pcpu = smp_processor_id();
+	if (vc->num_threads < threads_per_vcore(vc->kvm)) {
+		spin_lock(&lp->lock);
+		list_add_tail(&vc->preempt_list, &lp->list);
+		spin_unlock(&lp->lock);
+	}
+
+	/* Start accumulating stolen time */
+	kvmppc_core_start_stolen(vc, mftb());
+}
+
+static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
+{
+	struct preempted_vcore_list *lp;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	kvmppc_core_end_stolen(vc, mftb());
+	if (!list_empty(&vc->preempt_list)) {
+		lp = &per_cpu(preempted_vcores, vc->pcpu);
+		spin_lock(&lp->lock);
+		list_del_init(&vc->preempt_list);
+		spin_unlock(&lp->lock);
+	}
+	vc->vcore_state = VCORE_INACTIVE;
+}
+
+/*
+ * This stores information about the virtual cores currently
+ * assigned to a physical core.
+ */
+struct core_info {
+	int		n_subcores;
+	int		max_subcore_threads;
+	int		total_threads;
+	int		subcore_threads[MAX_SUBCORES];
+	struct kvmppc_vcore *vc[MAX_SUBCORES];
+};
+
+/*
+ * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
+ * respectively in 2-way micro-threading (split-core) mode on POWER8.
+ */
+static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
+
+static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
+{
+	memset(cip, 0, sizeof(*cip));
+	cip->n_subcores = 1;
+	cip->max_subcore_threads = vc->num_threads;
+	cip->total_threads = vc->num_threads;
+	cip->subcore_threads[0] = vc->num_threads;
+	cip->vc[0] = vc;
+}
+
+static bool subcore_config_ok(int n_subcores, int n_threads)
+{
+	/*
+	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
+	 * split-core mode, with one thread per subcore.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return n_subcores <= 4 && n_threads == 1;
+
+	/* On POWER8, can only dynamically split if unsplit to begin with */
+	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
+		return false;
+	if (n_subcores > MAX_SUBCORES)
+		return false;
+	if (n_subcores > 1) {
+		if (!(dynamic_mt_modes & 2))
+			n_subcores = 4;
+		if (n_subcores > 2 && !(dynamic_mt_modes & 4))
+			return false;
+	}
+
+	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
+}
+
+static void init_vcore_to_run(struct kvmppc_vcore *vc)
+{
+	vc->entry_exit_map = 0;
+	vc->in_guest = 0;
+	vc->napping_threads = 0;
+	vc->conferring_threads = 0;
+	vc->tb_offset_applied = 0;
+}
+
+static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
+{
+	int n_threads = vc->num_threads;
+	int sub;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return false;
+
+	/* In one_vm_per_core mode, require all vcores to be from the same vm */
+	if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
+		return false;
+
+	if (n_threads < cip->max_subcore_threads)
+		n_threads = cip->max_subcore_threads;
+	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
+		return false;
+	cip->max_subcore_threads = n_threads;
+
+	sub = cip->n_subcores;
+	++cip->n_subcores;
+	cip->total_threads += vc->num_threads;
+	cip->subcore_threads[sub] = vc->num_threads;
+	cip->vc[sub] = vc;
+	init_vcore_to_run(vc);
+	list_del_init(&vc->preempt_list);
+
+	return true;
+}
+
+/*
+ * Work out whether it is possible to piggyback the execution of
+ * vcore *pvc onto the execution of the other vcores described in *cip.
+ */
+static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
+			  int target_threads)
+{
+	if (cip->total_threads + pvc->num_threads > target_threads)
+		return false;
+
+	return can_dynamic_split(pvc, cip);
+}
+
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	for_each_runnable_thread(i, vcpu, vc) {
+		if (signal_pending(vcpu->arch.run_task))
+			vcpu->arch.ret = -EINTR;
+		else if (vcpu->arch.vpa.update_pending ||
+			 vcpu->arch.slb_shadow.update_pending ||
+			 vcpu->arch.dtl.update_pending)
+			vcpu->arch.ret = RESUME_GUEST;
+		else
+			continue;
+		kvmppc_remove_runnable(vc, vcpu, mftb());
+		wake_up(&vcpu->arch.cpu_run);
+	}
+}
+
+static void collect_piggybacks(struct core_info *cip, int target_threads)
+{
+	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+	struct kvmppc_vcore *pvc, *vcnext;
+
+	spin_lock(&lp->lock);
+	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
+		if (!spin_trylock(&pvc->lock))
+			continue;
+		prepare_threads(pvc);
+		if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
+			list_del_init(&pvc->preempt_list);
+			if (pvc->runner == NULL) {
+				pvc->vcore_state = VCORE_INACTIVE;
+				kvmppc_core_end_stolen(pvc, mftb());
+			}
+			spin_unlock(&pvc->lock);
+			continue;
+		}
+		if (!can_piggyback(pvc, cip, target_threads)) {
+			spin_unlock(&pvc->lock);
+			continue;
+		}
+		kvmppc_core_end_stolen(pvc, mftb());
+		pvc->vcore_state = VCORE_PIGGYBACK;
+		if (cip->total_threads >= target_threads)
+			break;
+	}
+	spin_unlock(&lp->lock);
+}
+
+static bool recheck_signals_and_mmu(struct core_info *cip)
+{
+	int sub, i;
+	struct kvm_vcpu *vcpu;
+	struct kvmppc_vcore *vc;
+
+	for (sub = 0; sub < cip->n_subcores; ++sub) {
+		vc = cip->vc[sub];
+		if (!vc->kvm->arch.mmu_ready)
+			return true;
+		for_each_runnable_thread(i, vcpu, vc)
+			if (signal_pending(vcpu->arch.run_task))
+				return true;
+	}
+	return false;
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
+{
+	int still_running = 0, i;
+	u64 now;
+	long ret;
+	struct kvm_vcpu *vcpu;
+
+	spin_lock(&vc->lock);
+	now = get_tb();
+	for_each_runnable_thread(i, vcpu, vc) {
+		/*
+		 * It's safe to unlock the vcore in the loop here, because
+		 * for_each_runnable_thread() is safe against removal of
+		 * the vcpu, and the vcore state is VCORE_EXITING here,
+		 * so any vcpus becoming runnable will have their arch.trap
+		 * set to zero and can't actually run in the guest.
+		 */
+		spin_unlock(&vc->lock);
+		/* cancel pending dec exception if dec is positive */
+		if (now < kvmppc_dec_expires_host_tb(vcpu) &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+
+		trace_kvm_guest_exit(vcpu);
+
+		ret = RESUME_GUEST;
+		if (vcpu->arch.trap)
+			ret = kvmppc_handle_exit_hv(vcpu,
+						    vcpu->arch.run_task);
+
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+
+		spin_lock(&vc->lock);
+		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			if (vcpu->arch.pending_exceptions)
+				kvmppc_core_prepare_to_enter(vcpu);
+			if (vcpu->arch.ceded)
+				kvmppc_set_timer(vcpu);
+			else
+				++still_running;
+		} else {
+			kvmppc_remove_runnable(vc, vcpu, mftb());
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+	if (!is_master) {
+		if (still_running > 0) {
+			kvmppc_vcore_preempt(vc);
+		} else if (vc->runner) {
+			vc->vcore_state = VCORE_PREEMPT;
+			kvmppc_core_start_stolen(vc, mftb());
+		} else {
+			vc->vcore_state = VCORE_INACTIVE;
+		}
+		if (vc->n_runnable > 0 && vc->runner == NULL) {
+			/* make sure there's a candidate runner awake */
+			i = -1;
+			vcpu = next_runnable_thread(vc, &i);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+	spin_unlock(&vc->lock);
+}
+
+/*
+ * Clear core from the list of active host cores as we are about to
+ * enter the guest. Only do this if it is the primary thread of the
+ * core (not if a subcore) that is entering the guest.
+ */
+static inline int kvmppc_clear_host_core(unsigned int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return 0;
+	/*
+	 * Memory barrier can be omitted here as we will do a smp_wmb()
+	 * later in kvmppc_start_thread and we need ensure that state is
+	 * visible to other CPUs only after we enter guest.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+	return 0;
+}
+
+/*
+ * Advertise this core as an active host core since we exited the guest
+ * Only need to do this if it is the primary thread of the core that is
+ * exiting.
+ */
+static inline int kvmppc_set_host_core(unsigned int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return 0;
+
+	/*
+	 * Memory barrier can be omitted here because we do a spin_unlock
+	 * immediately after this which provides the memory barrier.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+	return 0;
+}
+
+static void set_irq_happened(int trap)
+{
+	switch (trap) {
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		local_paca->irq_happened |= PACA_IRQ_EE;
+		break;
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+		local_paca->irq_happened |= PACA_IRQ_DBELL;
+		break;
+	case BOOK3S_INTERRUPT_HMI:
+		local_paca->irq_happened |= PACA_IRQ_HMI;
+		break;
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		replay_system_reset();
+		break;
+	}
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+	int srcu_idx;
+	struct core_info core_info;
+	struct kvmppc_vcore *pvc;
+	struct kvm_split_mode split_info, *sip;
+	int split, subcore_size, active;
+	int sub;
+	bool thr0_done;
+	unsigned long cmd_bit, stat_bit;
+	int pcpu, thr;
+	int target_threads;
+	int controlled_threads;
+	int trap;
+	bool is_power8;
+
+	if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
+		return;
+
+	/*
+	 * Remove from the list any threads that have a signal pending
+	 * or need a VPA update done
+	 */
+	prepare_threads(vc);
+
+	/* if the runner is no longer runnable, let the caller pick a new one */
+	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+
+	/*
+	 * Initialize *vc.
+	 */
+	init_vcore_to_run(vc);
+	vc->preempt_tb = TB_NIL;
+
+	/*
+	 * Number of threads that we will be controlling: the same as
+	 * the number of threads per subcore, except on POWER9,
+	 * where it's 1 because the threads are (mostly) independent.
+	 */
+	controlled_threads = threads_per_vcore(vc->kvm);
+
+	/*
+	 * Make sure we are running on primary threads, and that secondary
+	 * threads are offline.  Also check if the number of threads in this
+	 * guest are greater than the current system threads per guest.
+	 */
+	if ((controlled_threads > 1) &&
+	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+		for_each_runnable_thread(i, vcpu, vc) {
+			vcpu->arch.ret = -EBUSY;
+			kvmppc_remove_runnable(vc, vcpu, mftb());
+			wake_up(&vcpu->arch.cpu_run);
+		}
+		goto out;
+	}
+
+	/*
+	 * See if we could run any other vcores on the physical core
+	 * along with this one.
+	 */
+	init_core_info(&core_info, vc);
+	pcpu = smp_processor_id();
+	target_threads = controlled_threads;
+	if (target_smt_mode && target_smt_mode < target_threads)
+		target_threads = target_smt_mode;
+	if (vc->num_threads < target_threads)
+		collect_piggybacks(&core_info, target_threads);
+
+	/*
+	 * Hard-disable interrupts, and check resched flag and signals.
+	 * If we need to reschedule or deliver a signal, clean up
+	 * and return without going into the guest(s).
+	 * If the mmu_ready flag has been cleared, don't go into the
+	 * guest because that means a HPT resize operation is in progress.
+	 */
+	local_irq_disable();
+	hard_irq_disable();
+	if (lazy_irq_pending() || need_resched() ||
+	    recheck_signals_and_mmu(&core_info)) {
+		local_irq_enable();
+		vc->vcore_state = VCORE_INACTIVE;
+		/* Unlock all except the primary vcore */
+		for (sub = 1; sub < core_info.n_subcores; ++sub) {
+			pvc = core_info.vc[sub];
+			/* Put back on to the preempted vcores list */
+			kvmppc_vcore_preempt(pvc);
+			spin_unlock(&pvc->lock);
+		}
+		for (i = 0; i < controlled_threads; ++i)
+			kvmppc_release_hwthread(pcpu + i);
+		return;
+	}
+
+	kvmppc_clear_host_core(pcpu);
+
+	/* Decide on micro-threading (split-core) mode */
+	subcore_size = threads_per_subcore;
+	cmd_bit = stat_bit = 0;
+	split = core_info.n_subcores;
+	sip = NULL;
+	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
+
+	if (split > 1) {
+		sip = &split_info;
+		memset(&split_info, 0, sizeof(split_info));
+		for (sub = 0; sub < core_info.n_subcores; ++sub)
+			split_info.vc[sub] = core_info.vc[sub];
+
+		if (is_power8) {
+			if (split == 2 && (dynamic_mt_modes & 2)) {
+				cmd_bit = HID0_POWER8_1TO2LPAR;
+				stat_bit = HID0_POWER8_2LPARMODE;
+			} else {
+				split = 4;
+				cmd_bit = HID0_POWER8_1TO4LPAR;
+				stat_bit = HID0_POWER8_4LPARMODE;
+			}
+			subcore_size = MAX_SMT_THREADS / split;
+			split_info.rpr = mfspr(SPRN_RPR);
+			split_info.pmmar = mfspr(SPRN_PMMAR);
+			split_info.ldbar = mfspr(SPRN_LDBAR);
+			split_info.subcore_size = subcore_size;
+		} else {
+			split_info.subcore_size = 1;
+		}
+
+		/* order writes to split_info before kvm_split_mode pointer */
+		smp_wmb();
+	}
+
+	for (thr = 0; thr < controlled_threads; ++thr) {
+		struct paca_struct *paca = paca_ptrs[pcpu + thr];
+
+		paca->kvm_hstate.napping = 0;
+		paca->kvm_hstate.kvm_split_mode = sip;
+	}
+
+	/* Initiate micro-threading (split-core) on POWER8 if required */
+	if (cmd_bit) {
+		unsigned long hid0 = mfspr(SPRN_HID0);
+
+		hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
+		mb();
+		mtspr(SPRN_HID0, hid0);
+		isync();
+		for (;;) {
+			hid0 = mfspr(SPRN_HID0);
+			if (hid0 & stat_bit)
+				break;
+			cpu_relax();
+		}
+	}
+
+	/*
+	 * On POWER8, set RWMR register.
+	 * Since it only affects PURR and SPURR, it doesn't affect
+	 * the host, so we don't save/restore the host value.
+	 */
+	if (is_power8) {
+		unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
+		int n_online = atomic_read(&vc->online_count);
+
+		/*
+		 * Use the 8-thread value if we're doing split-core
+		 * or if the vcore's online count looks bogus.
+		 */
+		if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
+		    n_online >= 1 && n_online <= MAX_SMT_THREADS)
+			rwmr_val = p8_rwmr_values[n_online];
+		mtspr(SPRN_RWMR, rwmr_val);
+	}
+
+	/* Start all the threads */
+	active = 0;
+	for (sub = 0; sub < core_info.n_subcores; ++sub) {
+		thr = is_power8 ? subcore_thread_map[sub] : sub;
+		thr0_done = false;
+		active |= 1 << thr;
+		pvc = core_info.vc[sub];
+		pvc->pcpu = pcpu + thr;
+		for_each_runnable_thread(i, vcpu, pvc) {
+			/*
+			 * XXX: is kvmppc_start_thread called too late here?
+			 * It updates vcpu->cpu and vcpu->arch.thread_cpu
+			 * which are used by kvmppc_fast_vcpu_kick_hv(), but
+			 * kick is called after new exceptions become available
+			 * and exceptions are checked earlier than here, by
+			 * kvmppc_core_prepare_to_enter.
+			 */
+			kvmppc_start_thread(vcpu, pvc);
+			kvmppc_update_vpa_dispatch(vcpu, pvc);
+			trace_kvm_guest_enter(vcpu);
+			if (!vcpu->arch.ptid)
+				thr0_done = true;
+			active |= 1 << (thr + vcpu->arch.ptid);
+		}
+		/*
+		 * We need to start the first thread of each subcore
+		 * even if it doesn't have a vcpu.
+		 */
+		if (!thr0_done)
+			kvmppc_start_thread(NULL, pvc);
+	}
+
+	/*
+	 * Ensure that split_info.do_nap is set after setting
+	 * the vcore pointer in the PACA of the secondaries.
+	 */
+	smp_mb();
+
+	/*
+	 * When doing micro-threading, poke the inactive threads as well.
+	 * This gets them to the nap instruction after kvm_do_nap,
+	 * which reduces the time taken to unsplit later.
+	 */
+	if (cmd_bit) {
+		split_info.do_nap = 1;	/* ask secondaries to nap when done */
+		for (thr = 1; thr < threads_per_subcore; ++thr)
+			if (!(active & (1 << thr)))
+				kvmppc_ipi_thread(pcpu + thr);
+	}
+
+	vc->vcore_state = VCORE_RUNNING;
+	preempt_disable();
+
+	trace_kvmppc_run_core(vc, 0);
+
+	for (sub = 0; sub < core_info.n_subcores; ++sub)
+		spin_unlock(&core_info.vc[sub]->lock);
+
+	guest_timing_enter_irqoff();
+
+	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
+
+	guest_state_enter_irqoff();
+	this_cpu_disable_ftrace();
+
+	trap = __kvmppc_vcore_entry();
+
+	this_cpu_enable_ftrace();
+	guest_state_exit_irqoff();
+
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+	set_irq_happened(trap);
+
+	spin_lock(&vc->lock);
+	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
+	vc->vcore_state = VCORE_EXITING;
+
+	/* wait for secondary threads to finish writing their state to memory */
+	kvmppc_wait_for_nap(controlled_threads);
+
+	/* Return to whole-core mode if we split the core earlier */
+	if (cmd_bit) {
+		unsigned long hid0 = mfspr(SPRN_HID0);
+		unsigned long loops = 0;
+
+		hid0 &= ~HID0_POWER8_DYNLPARDIS;
+		stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+		mb();
+		mtspr(SPRN_HID0, hid0);
+		isync();
+		for (;;) {
+			hid0 = mfspr(SPRN_HID0);
+			if (!(hid0 & stat_bit))
+				break;
+			cpu_relax();
+			++loops;
+		}
+		split_info.do_nap = 0;
+	}
+
+	kvmppc_set_host_core(pcpu);
+
+	if (!vtime_accounting_enabled_this_cpu()) {
+		local_irq_enable();
+		/*
+		 * Service IRQs here before guest_timing_exit_irqoff() so any
+		 * ticks that occurred while running the guest are accounted to
+		 * the guest. If vtime accounting is enabled, accounting uses
+		 * TB rather than ticks, so it can be done without enabling
+		 * interrupts here, which has the problem that it accounts
+		 * interrupt processing overhead to the host.
+		 */
+		local_irq_disable();
+	}
+	guest_timing_exit_irqoff();
+
+	local_irq_enable();
+
+	/* Let secondaries go back to the offline loop */
+	for (i = 0; i < controlled_threads; ++i) {
+		kvmppc_release_hwthread(pcpu + i);
+		if (sip && sip->napped[i])
+			kvmppc_ipi_thread(pcpu + i);
+	}
+
+	spin_unlock(&vc->lock);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
+
+	preempt_enable();
+
+	for (sub = 0; sub < core_info.n_subcores; ++sub) {
+		pvc = core_info.vc[sub];
+		post_guest_process(pvc, pvc == vc);
+	}
+
+	spin_lock(&vc->lock);
+
+ out:
+	vc->vcore_state = VCORE_INACTIVE;
+	trace_kvmppc_run_core(vc, 1);
+}
+
+static inline bool hcall_is_xics(unsigned long req)
+{
+	return req == H_EOI || req == H_CPPR || req == H_IPI ||
+		req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
+}
+
+static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
+{
+	struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+	if (lp) {
+		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+		lp->yield_count = cpu_to_be32(yield_count);
+		vcpu->arch.vpa.dirty = 1;
+	}
+}
+
+/* call our hypervisor to load up HV regs and go */
+static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long host_psscr;
+	unsigned long msr;
+	struct hv_guest_state hvregs;
+	struct p9_host_os_sprs host_os_sprs;
+	s64 dec;
+	int trap;
+
+	msr = mfmsr();
+
+	save_p9_host_os_sprs(&host_os_sprs);
+
+	/*
+	 * We need to save and restore the guest visible part of the
+	 * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
+	 * doesn't do this for us. Note only required if pseries since
+	 * this is done in kvmhv_vcpu_entry_p9() below otherwise.
+	 */
+	host_psscr = mfspr(SPRN_PSSCR_PR);
+
+	kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
+	if (lazy_irq_pending())
+		return 0;
+
+	if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
+		msr = mfmsr(); /* TM restore can update msr */
+
+	if (vcpu->arch.psscr != host_psscr)
+		mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+
+	kvmhv_save_hv_regs(vcpu, &hvregs);
+	hvregs.lpcr = lpcr;
+	hvregs.amor = ~0;
+	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+	hvregs.version = HV_GUEST_STATE_VERSION;
+	if (vcpu->arch.nested) {
+		hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+		hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+	} else {
+		hvregs.lpid = vcpu->kvm->arch.lpid;
+		hvregs.vcpu_token = vcpu->vcpu_id;
+	}
+	hvregs.hdec_expiry = time_limit;
+
+	/*
+	 * When setting DEC, we must always deal with irq_work_raise
+	 * via NMI vs setting DEC. The problem occurs right as we
+	 * switch into guest mode if a NMI hits and sets pending work
+	 * and sets DEC, then that will apply to the guest and not
+	 * bring us back to the host.
+	 *
+	 * irq_work_raise could check a flag (or possibly LPCR[HDICE]
+	 * for example) and set HDEC to 1? That wouldn't solve the
+	 * nested hv case which needs to abort the hcall or zero the
+	 * time limit.
+	 *
+	 * XXX: Another day's problem.
+	 */
+	mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
+
+	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+	switch_pmu_to_guest(vcpu, &host_os_sprs);
+	accumulate_time(vcpu, &vcpu->arch.in_guest);
+	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+				  __pa(&vcpu->arch.regs));
+	accumulate_time(vcpu, &vcpu->arch.guest_exit);
+	kvmhv_restore_hv_return_state(vcpu, &hvregs);
+	switch_pmu_to_host(vcpu, &host_os_sprs);
+	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+	vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
+
+	store_vcpu_state(vcpu);
+
+	dec = mfspr(SPRN_DEC);
+	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+		dec = (s32) dec;
+	*tb = mftb();
+	vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset);
+
+	timer_rearm_host_dec(*tb);
+
+	restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+	if (vcpu->arch.psscr != host_psscr)
+		mtspr(SPRN_PSSCR_PR, host_psscr);
+
+	return trap;
+}
+
+/*
+ * Guest entry for POWER9 and later CPUs.
+ */
+static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+			 unsigned long lpcr, u64 *tb)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	u64 next_timer;
+	int trap;
+
+	next_timer = timer_get_next_tb();
+	if (*tb >= next_timer)
+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
+	if (next_timer < time_limit)
+		time_limit = next_timer;
+	else if (*tb >= time_limit) /* nested time limit */
+		return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
+
+	vcpu->arch.ceded = 0;
+
+	vcpu_vpa_increment_dispatch(vcpu);
+
+	if (kvmhv_on_pseries()) {
+		trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
+
+		/* H_CEDE has to be handled now, not later */
+		if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
+		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
+			kvmppc_cede(vcpu);
+			kvmppc_set_gpr(vcpu, 3, 0);
+			trap = 0;
+		}
+
+	} else if (nested) {
+		__this_cpu_write(cpu_in_guest, kvm);
+		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
+		__this_cpu_write(cpu_in_guest, NULL);
+
+	} else {
+		kvmppc_xive_push_vcpu(vcpu);
+
+		__this_cpu_write(cpu_in_guest, kvm);
+		trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
+		__this_cpu_write(cpu_in_guest, NULL);
+
+		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
+		    !(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
+			unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+			/*
+			 * XIVE rearm and XICS hcalls must be handled
+			 * before xive context is pulled (is this
+			 * true?)
+			 */
+			if (req == H_CEDE) {
+				/* H_CEDE has to be handled now */
+				kvmppc_cede(vcpu);
+				if (!kvmppc_xive_rearm_escalation(vcpu)) {
+					/*
+					 * Pending escalation so abort
+					 * the cede.
+					 */
+					vcpu->arch.ceded = 0;
+				}
+				kvmppc_set_gpr(vcpu, 3, 0);
+				trap = 0;
+
+			} else if (req == H_ENTER_NESTED) {
+				/*
+				 * L2 should not run with the L1
+				 * context so rearm and pull it.
+				 */
+				if (!kvmppc_xive_rearm_escalation(vcpu)) {
+					/*
+					 * Pending escalation so abort
+					 * H_ENTER_NESTED.
+					 */
+					kvmppc_set_gpr(vcpu, 3, 0);
+					trap = 0;
+				}
+
+			} else if (hcall_is_xics(req)) {
+				int ret;
+
+				ret = kvmppc_xive_xics_hcall(vcpu, req);
+				if (ret != H_TOO_HARD) {
+					kvmppc_set_gpr(vcpu, 3, ret);
+					trap = 0;
+				}
+			}
+		}
+		kvmppc_xive_pull_vcpu(vcpu);
+
+		if (kvm_is_radix(kvm))
+			vcpu->arch.slb_max = 0;
+	}
+
+	vcpu_vpa_increment_dispatch(vcpu);
+
+	return trap;
+}
+
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
+				 struct kvm_vcpu *vcpu, int wait_state)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		spin_unlock(&vc->lock);
+		schedule();
+		spin_lock(&vc->lock);
+	}
+	finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	if (!halt_poll_ns_grow)
+		return;
+
+	vc->halt_poll_ns *= halt_poll_ns_grow;
+	if (vc->halt_poll_ns < halt_poll_ns_grow_start)
+		vc->halt_poll_ns = halt_poll_ns_grow_start;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	if (halt_poll_ns_shrink == 0)
+		vc->halt_poll_ns = 0;
+	else
+		vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+#ifdef CONFIG_KVM_XICS
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+	if (!xics_on_xive())
+		return false;
+	return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
+		vcpu->arch.xive_saved_state.cppr;
+}
+#else
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+#endif /* CONFIG_KVM_XICS */
+
+static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
+	    kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
+		return true;
+
+	return false;
+}
+
+static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
+		return true;
+	return false;
+}
+
+/*
+ * Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	for_each_runnable_thread(i, vcpu, vc) {
+		if (kvmppc_vcpu_check_block(vcpu))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+	ktime_t cur, start_poll, start_wait;
+	int do_sleep = 1;
+	u64 block_ns;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	/* Poll for pending exceptions and ceded state */
+	cur = start_poll = ktime_get();
+	if (vc->halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+		++vc->runner->stat.generic.halt_attempted_poll;
+
+		vc->vcore_state = VCORE_POLLING;
+		spin_unlock(&vc->lock);
+
+		do {
+			if (kvmppc_vcore_check_block(vc)) {
+				do_sleep = 0;
+				break;
+			}
+			cur = ktime_get();
+		} while (kvm_vcpu_can_poll(cur, stop));
+
+		spin_lock(&vc->lock);
+		vc->vcore_state = VCORE_INACTIVE;
+
+		if (!do_sleep) {
+			++vc->runner->stat.generic.halt_successful_poll;
+			goto out;
+		}
+	}
+
+	prepare_to_rcuwait(&vc->wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (kvmppc_vcore_check_block(vc)) {
+		finish_rcuwait(&vc->wait);
+		do_sleep = 0;
+		/* If we polled, count this as a successful poll */
+		if (vc->halt_poll_ns)
+			++vc->runner->stat.generic.halt_successful_poll;
+		goto out;
+	}
+
+	start_wait = ktime_get();
+
+	vc->vcore_state = VCORE_SLEEPING;
+	trace_kvmppc_vcore_blocked(vc->runner, 0);
+	spin_unlock(&vc->lock);
+	schedule();
+	finish_rcuwait(&vc->wait);
+	spin_lock(&vc->lock);
+	vc->vcore_state = VCORE_INACTIVE;
+	trace_kvmppc_vcore_blocked(vc->runner, 1);
+	++vc->runner->stat.halt_successful_wait;
+
+	cur = ktime_get();
+
+out:
+	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+	/* Attribute wait time */
+	if (do_sleep) {
+		vc->runner->stat.generic.halt_wait_ns +=
+			ktime_to_ns(cur) - ktime_to_ns(start_wait);
+		KVM_STATS_LOG_HIST_UPDATE(
+				vc->runner->stat.generic.halt_wait_hist,
+				ktime_to_ns(cur) - ktime_to_ns(start_wait));
+		/* Attribute failed poll time */
+		if (vc->halt_poll_ns) {
+			vc->runner->stat.generic.halt_poll_fail_ns +=
+				ktime_to_ns(start_wait) -
+				ktime_to_ns(start_poll);
+			KVM_STATS_LOG_HIST_UPDATE(
+				vc->runner->stat.generic.halt_poll_fail_hist,
+				ktime_to_ns(start_wait) -
+				ktime_to_ns(start_poll));
+		}
+	} else {
+		/* Attribute successful poll time */
+		if (vc->halt_poll_ns) {
+			vc->runner->stat.generic.halt_poll_success_ns +=
+				ktime_to_ns(cur) -
+				ktime_to_ns(start_poll);
+			KVM_STATS_LOG_HIST_UPDATE(
+				vc->runner->stat.generic.halt_poll_success_hist,
+				ktime_to_ns(cur) - ktime_to_ns(start_poll));
+		}
+	}
+
+	/* Adjust poll time */
+	if (halt_poll_ns) {
+		if (block_ns <= vc->halt_poll_ns)
+			;
+		/* We slept and blocked for longer than the max halt time */
+		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
+			shrink_halt_poll_ns(vc);
+		/* We slept and our poll time is too small */
+		else if (vc->halt_poll_ns < halt_poll_ns &&
+				block_ns < halt_poll_ns)
+			grow_halt_poll_ns(vc);
+		if (vc->halt_poll_ns > halt_poll_ns)
+			vc->halt_poll_ns = halt_poll_ns;
+	} else
+		vc->halt_poll_ns = 0;
+
+	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
+}
+
+/*
+ * This never fails for a radix guest, as none of the operations it does
+ * for a radix guest can fail or have a way to report failure.
+ */
+static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	struct kvm *kvm = vcpu->kvm;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+	if (!kvm->arch.mmu_ready) {
+		if (!kvm_is_radix(kvm))
+			r = kvmppc_hv_setup_htab_rma(vcpu);
+		if (!r) {
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				kvmppc_setup_partition_table(kvm);
+			kvm->arch.mmu_ready = 1;
+		}
+	}
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return r;
+}
+
+static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	int n_ceded, i, r;
+	struct kvmppc_vcore *vc;
+	struct kvm_vcpu *v;
+
+	trace_kvmppc_run_vcpu_enter(vcpu);
+
+	run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+	kvmppc_update_vpas(vcpu);
+
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	vcpu->arch.ceded = 0;
+	vcpu->arch.run_task = current;
+	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	vcpu->arch.busy_preempt = TB_NIL;
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
+	++vc->n_runnable;
+
+	/*
+	 * This happens the first time this is called for a vcpu.
+	 * If the vcore is already running, we may be able to start
+	 * this thread straight away and have it join in.
+	 */
+	if (!signal_pending(current)) {
+		if ((vc->vcore_state == VCORE_PIGGYBACK ||
+		     vc->vcore_state == VCORE_RUNNING) &&
+			   !VCORE_IS_EXITING(vc)) {
+			kvmppc_update_vpa_dispatch(vcpu, vc);
+			kvmppc_start_thread(vcpu, vc);
+			trace_kvm_guest_enter(vcpu);
+		} else if (vc->vcore_state == VCORE_SLEEPING) {
+		        rcuwait_wake_up(&vc->wait);
+		}
+
+	}
+
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       !signal_pending(current)) {
+		/* See if the MMU is ready to go */
+		if (!vcpu->kvm->arch.mmu_ready) {
+			spin_unlock(&vc->lock);
+			r = kvmhv_setup_mmu(vcpu);
+			spin_lock(&vc->lock);
+			if (r) {
+				run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+				run->fail_entry.
+					hardware_entry_failure_reason = 0;
+				vcpu->arch.ret = r;
+				break;
+			}
+		}
+
+		if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+			kvmppc_vcore_end_preempt(vc);
+
+		if (vc->vcore_state != VCORE_INACTIVE) {
+			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
+			continue;
+		}
+		for_each_runnable_thread(i, v, vc) {
+			kvmppc_core_prepare_to_enter(v);
+			if (signal_pending(v->arch.run_task)) {
+				kvmppc_remove_runnable(vc, v, mftb());
+				v->stat.signal_exits++;
+				v->run->exit_reason = KVM_EXIT_INTR;
+				v->arch.ret = -EINTR;
+				wake_up(&v->arch.cpu_run);
+			}
+		}
+		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+			break;
+		n_ceded = 0;
+		for_each_runnable_thread(i, v, vc) {
+			if (!kvmppc_vcpu_woken(v))
+				n_ceded += v->arch.ceded;
+			else
+				v->arch.ceded = 0;
+		}
+		vc->runner = vcpu;
+		if (n_ceded == vc->n_runnable) {
+			kvmppc_vcore_blocked(vc);
+		} else if (need_resched()) {
+			kvmppc_vcore_preempt(vc);
+			/* Let something else run */
+			cond_resched_lock(&vc->lock);
+			if (vc->vcore_state == VCORE_PREEMPT)
+				kvmppc_vcore_end_preempt(vc);
+		} else {
+			kvmppc_run_core(vc);
+		}
+		vc->runner = NULL;
+	}
+
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       (vc->vcore_state == VCORE_RUNNING ||
+		vc->vcore_state == VCORE_EXITING ||
+		vc->vcore_state == VCORE_PIGGYBACK))
+		kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
+
+	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+		kvmppc_vcore_end_preempt(vc);
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		kvmppc_remove_runnable(vc, vcpu, mftb());
+		vcpu->stat.signal_exits++;
+		run->exit_reason = KVM_EXIT_INTR;
+		vcpu->arch.ret = -EINTR;
+	}
+
+	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
+		/* Wake up some vcpu to run the core */
+		i = -1;
+		v = next_runnable_thread(vc, &i);
+		wake_up(&v->arch.cpu_run);
+	}
+
+	trace_kvmppc_run_vcpu_exit(vcpu);
+	spin_unlock(&vc->lock);
+	return vcpu->arch.ret;
+}
+
+int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
+			  unsigned long lpcr)
+{
+	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
+	struct kvm_run *run = vcpu->run;
+	int trap, r, pcpu;
+	int srcu_idx;
+	struct kvmppc_vcore *vc;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	unsigned long flags;
+	u64 tb;
+
+	trace_kvmppc_run_vcpu_enter(vcpu);
+
+	run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
+	vc = vcpu->arch.vcore;
+	vcpu->arch.ceded = 0;
+	vcpu->arch.run_task = current;
+	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+
+	/* See if the MMU is ready to go */
+	if (unlikely(!kvm->arch.mmu_ready)) {
+		r = kvmhv_setup_mmu(vcpu);
+		if (r) {
+			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+			run->fail_entry.hardware_entry_failure_reason = 0;
+			vcpu->arch.ret = r;
+			return r;
+		}
+	}
+
+	if (need_resched())
+		cond_resched();
+
+	kvmppc_update_vpas(vcpu);
+
+	preempt_disable();
+	pcpu = smp_processor_id();
+	if (kvm_is_radix(kvm))
+		kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+
+	/* flags save not required, but irq_pmu has no disable/enable API */
+	powerpc_local_irq_pmu_save(flags);
+
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+
+	if (signal_pending(current))
+		goto sigpend;
+	if (need_resched() || !kvm->arch.mmu_ready)
+		goto out;
+
+	vcpu->cpu = pcpu;
+	vcpu->arch.thread_cpu = pcpu;
+	vc->pcpu = pcpu;
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+	local_paca->kvm_hstate.ptid = 0;
+	local_paca->kvm_hstate.fake_suspend = 0;
+
+	/*
+	 * Orders set cpu/thread_cpu vs testing for pending interrupts and
+	 * doorbells below. The other side is when these fields are set vs
+	 * kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
+	 * kick a vCPU to notice the pending interrupt.
+	 */
+	smp_mb();
+
+	if (!nested) {
+		kvmppc_core_prepare_to_enter(vcpu);
+		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+			     &vcpu->arch.pending_exceptions) ||
+		    xive_interrupt_pending(vcpu)) {
+			/*
+			 * For nested HV, don't synthesize but always pass MER,
+			 * the L0 will be able to optimise that more
+			 * effectively than manipulating registers directly.
+			 */
+			if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE))
+				kvmppc_inject_interrupt_hv(vcpu,
+							   BOOK3S_INTERRUPT_EXTERNAL, 0);
+			else
+				lpcr |= LPCR_MER;
+		}
+	} else if (vcpu->arch.pending_exceptions ||
+		   vcpu->arch.doorbell_request ||
+		   xive_interrupt_pending(vcpu)) {
+		vcpu->arch.ret = RESUME_HOST;
+		goto out;
+	}
+
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
+
+	tb = mftb();
+
+	kvmppc_update_vpa_dispatch_p9(vcpu, vc, tb + vc->tb_offset);
+
+	trace_kvm_guest_enter(vcpu);
+
+	guest_timing_enter_irqoff();
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	guest_state_enter_irqoff();
+	this_cpu_disable_ftrace();
+
+	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
+	vcpu->arch.trap = trap;
+
+	this_cpu_enable_ftrace();
+	guest_state_exit_irqoff();
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	set_irq_happened(trap);
+
+	vcpu->cpu = -1;
+	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+
+	if (!vtime_accounting_enabled_this_cpu()) {
+		powerpc_local_irq_pmu_restore(flags);
+		/*
+		 * Service IRQs here before guest_timing_exit_irqoff() so any
+		 * ticks that occurred while running the guest are accounted to
+		 * the guest. If vtime accounting is enabled, accounting uses
+		 * TB rather than ticks, so it can be done without enabling
+		 * interrupts here, which has the problem that it accounts
+		 * interrupt processing overhead to the host.
+		 */
+		powerpc_local_irq_pmu_save(flags);
+	}
+	guest_timing_exit_irqoff();
+
+	powerpc_local_irq_pmu_restore(flags);
+
+	preempt_enable();
+
+	/*
+	 * cancel pending decrementer exception if DEC is now positive, or if
+	 * entering a nested guest in which case the decrementer is now owned
+	 * by L2 and the L1 decrementer is provided in hdec_expires
+	 */
+	if (kvmppc_core_pending_dec(vcpu) &&
+			((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
+			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
+			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	trace_kvm_guest_exit(vcpu);
+	r = RESUME_GUEST;
+	if (trap) {
+		if (!nested)
+			r = kvmppc_handle_exit_hv(vcpu, current);
+		else
+			r = kvmppc_handle_nested_exit(vcpu);
+	}
+	vcpu->arch.ret = r;
+
+	if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
+		kvmppc_set_timer(vcpu);
+
+		prepare_to_rcuwait(wait);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (signal_pending(current)) {
+				vcpu->stat.signal_exits++;
+				run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+
+			if (kvmppc_vcpu_check_block(vcpu))
+				break;
+
+			trace_kvmppc_vcore_blocked(vcpu, 0);
+			schedule();
+			trace_kvmppc_vcore_blocked(vcpu, 1);
+		}
+		finish_rcuwait(wait);
+	}
+	vcpu->arch.ceded = 0;
+
+ done:
+	trace_kvmppc_run_vcpu_exit(vcpu);
+
+	return vcpu->arch.ret;
+
+ sigpend:
+	vcpu->stat.signal_exits++;
+	run->exit_reason = KVM_EXIT_INTR;
+	vcpu->arch.ret = -EINTR;
+ out:
+	vcpu->cpu = -1;
+	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	powerpc_local_irq_pmu_restore(flags);
+	preempt_enable();
+	goto done;
+}
+
+static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	int r;
+	int srcu_idx;
+	struct kvm *kvm;
+	unsigned long msr;
+
+	start_timing(vcpu, &vcpu->arch.vcpu_entry);
+
+	if (!vcpu->arch.sane) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
+	/* No need to go into the guest when all we'll do is come back out */
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * Don't allow entry with a suspended transaction, because
+	 * the guest entry/exit code will lose it.
+	 */
+	if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+	    (current->thread.regs->msr & MSR_TM)) {
+		if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
+			run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+			run->fail_entry.hardware_entry_failure_reason = 0;
+			return -EINVAL;
+		}
+	}
+#endif
+
+	/*
+	 * Force online to 1 for the sake of old userspace which doesn't
+	 * set it.
+	 */
+	if (!vcpu->arch.online) {
+		atomic_inc(&vcpu->arch.vcore->online_count);
+		vcpu->arch.online = 1;
+	}
+
+	kvmppc_core_prepare_to_enter(vcpu);
+
+	kvm = vcpu->kvm;
+	atomic_inc(&kvm->arch.vcpus_running);
+	/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
+	smp_mb();
+
+	msr = 0;
+	if (IS_ENABLED(CONFIG_PPC_FPU))
+		msr |= MSR_FP;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		msr |= MSR_VEC;
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr |= MSR_VSX;
+	if ((cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+			(kvmppc_get_hfscr_hv(vcpu) & HFSCR_TM))
+		msr |= MSR_TM;
+	msr = msr_check_and_set(msr);
+
+	kvmppc_save_user_regs();
+
+	kvmppc_save_current_sprs();
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		vcpu->arch.waitp = &vcpu->arch.vcore->wait;
+	vcpu->arch.pgdir = kvm->mm->pgd;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+
+	do {
+		accumulate_time(vcpu, &vcpu->arch.guest_entry);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
+						  vcpu->arch.vcore->lpcr);
+		else
+			r = kvmppc_run_vcpu(vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
+			accumulate_time(vcpu, &vcpu->arch.hcall);
+
+			if (WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
+				/*
+				 * These should have been caught reflected
+				 * into the guest by now. Final sanity check:
+				 * don't allow userspace to execute hcalls in
+				 * the hypervisor.
+				 */
+				r = RESUME_GUEST;
+				continue;
+			}
+			trace_kvm_hcall_enter(vcpu);
+			r = kvmppc_pseries_do_hcall(vcpu);
+			trace_kvm_hcall_exit(vcpu, r);
+			kvmppc_core_prepare_to_enter(vcpu);
+		} else if (r == RESUME_PAGE_FAULT) {
+			accumulate_time(vcpu, &vcpu->arch.pg_fault);
+			srcu_idx = srcu_read_lock(&kvm->srcu);
+			r = kvmppc_book3s_hv_page_fault(vcpu,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+			srcu_read_unlock(&kvm->srcu, srcu_idx);
+		} else if (r == RESUME_PASSTHROUGH) {
+			if (WARN_ON(xics_on_xive()))
+				r = H_SUCCESS;
+			else
+				r = kvmppc_xics_rm_complete(vcpu, 0);
+		}
+	} while (is_kvmppc_resume_guest(r));
+	accumulate_time(vcpu, &vcpu->arch.vcpu_exit);
+
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
+	atomic_dec(&kvm->arch.vcpus_running);
+
+	srr_regs_clobbered();
+
+	end_timing(vcpu);
+
+	return r;
+}
+
+static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+				     int shift, int sllp)
+{
+	(*sps)->page_shift = shift;
+	(*sps)->slb_enc = sllp;
+	(*sps)->enc[0].page_shift = shift;
+	(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
+	/*
+	 * Add 16MB MPSS support (may get filtered out by userspace)
+	 */
+	if (shift != 24) {
+		int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+		if (penc != -1) {
+			(*sps)->enc[1].page_shift = 24;
+			(*sps)->enc[1].pte_enc = penc;
+		}
+	}
+	(*sps)++;
+}
+
+static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	struct kvm_ppc_one_seg_page_size *sps;
+
+	/*
+	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
+	 * POWER7 doesn't support keys for instruction accesses,
+	 * POWER8 and POWER9 do.
+	 */
+	info->data_keys = 32;
+	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
+
+	/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+	info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+	info->slb_size = 32;
+
+	/* We only support these sizes for now, and no muti-size segments */
+	sps = &info->sps[0];
+	kvmppc_add_seg_page_size(&sps, 12, 0);
+	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
+
+	/* If running as a nested hypervisor, we don't support HPT guests */
+	if (kvmhv_on_pseries())
+		info->flags |= KVM_PPC_NO_HASH;
+
+	return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
+					 struct kvm_dirty_log *log)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int r;
+	unsigned long n, i;
+	unsigned long *buf, *p;
+	struct kvm_vcpu *vcpu;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = -EINVAL;
+	if (log->slot >= KVM_USER_MEM_SLOTS)
+		goto out;
+
+	slots = kvm_memslots(kvm);
+	memslot = id_to_memslot(slots, log->slot);
+	r = -ENOENT;
+	if (!memslot || !memslot->dirty_bitmap)
+		goto out;
+
+	/*
+	 * Use second half of bitmap area because both HPT and radix
+	 * accumulate bits in the first half.
+	 */
+	n = kvm_dirty_bitmap_bytes(memslot);
+	buf = memslot->dirty_bitmap + n / sizeof(long);
+	memset(buf, 0, n);
+
+	if (kvm_is_radix(kvm))
+		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
+	else
+		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
+	if (r)
+		goto out;
+
+	/*
+	 * We accumulate dirty bits in the first half of the
+	 * memslot's dirty_bitmap area, for when pages are paged
+	 * out or modified by the host directly.  Pick up these
+	 * bits and add them to the map.
+	 */
+	p = memslot->dirty_bitmap;
+	for (i = 0; i < n / sizeof(long); ++i)
+		buf[i] |= xchg(&p[i], 0);
+
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, buf, n))
+		goto out;
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
+{
+	vfree(slot->arch.rmap);
+	slot->arch.rmap = NULL;
+}
+
+static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
+				const struct kvm_memory_slot *old,
+				struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+	if (change == KVM_MR_CREATE) {
+		unsigned long size = array_size(new->npages, sizeof(*new->arch.rmap));
+
+		if ((size >> PAGE_SHIFT) > totalram_pages())
+			return -ENOMEM;
+
+		new->arch.rmap = vzalloc(size);
+		if (!new->arch.rmap)
+			return -ENOMEM;
+	} else if (change != KVM_MR_DELETE) {
+		new->arch.rmap = old->arch.rmap;
+	}
+
+	return 0;
+}
+
+static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
+				struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+	/*
+	 * If we are creating or modifying a memslot, it might make
+	 * some address that was previously cached as emulated
+	 * MMIO be no longer emulated MMIO, so invalidate
+	 * all the caches of emulated MMIO translations.
+	 */
+	if (change != KVM_MR_DELETE)
+		atomic64_inc(&kvm->arch.mmio_update);
+
+	/*
+	 * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
+	 * have already called kvm_arch_flush_shadow_memslot() to
+	 * flush shadow mappings.  For KVM_MR_CREATE we have no
+	 * previous mappings.  So the only case to handle is
+	 * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
+	 * has been changed.
+	 * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
+	 * to get rid of any THP PTEs in the partition-scoped page tables
+	 * so we can track dirtiness at the page level; we flush when
+	 * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
+	 * using THP PTEs.
+	 */
+	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
+	    ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
+		kvmppc_radix_flush_memslot(kvm, old);
+	/*
+	 * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
+	 */
+	if (!kvm->arch.secure_guest)
+		return;
+
+	switch (change) {
+	case KVM_MR_CREATE:
+		/*
+		 * @TODO kvmppc_uvmem_memslot_create() can fail and
+		 * return error. Fix this.
+		 */
+		kvmppc_uvmem_memslot_create(kvm, new);
+		break;
+	case KVM_MR_DELETE:
+		kvmppc_uvmem_memslot_delete(kvm, old);
+		break;
+	default:
+		/* TODO: Handle KVM_MR_MOVE */
+		break;
+	}
+}
+
+/*
+ * Update LPCR values in kvm->arch and in vcores.
+ * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
+ * of kvm->arch.lpcr update).
+ */
+void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
+{
+	long int i;
+	u32 cores_done = 0;
+
+	if ((kvm->arch.lpcr & mask) == lpcr)
+		return;
+
+	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
+
+	for (i = 0; i < KVM_MAX_VCORES; ++i) {
+		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+		if (!vc)
+			continue;
+
+		spin_lock(&vc->lock);
+		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
+		verify_lpcr(kvm, vc->lpcr);
+		spin_unlock(&vc->lock);
+		if (++cores_done >= kvm->arch.online_vcores)
+			break;
+	}
+}
+
+void kvmppc_setup_partition_table(struct kvm *kvm)
+{
+	unsigned long dw0, dw1;
+
+	if (!kvm_is_radix(kvm)) {
+		/* PS field - page size for VRMA */
+		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+		/* HTABSIZE and HTABORG fields */
+		dw0 |= kvm->arch.sdr1;
+
+		/* Second dword as set by userspace */
+		dw1 = kvm->arch.process_table;
+	} else {
+		dw0 = PATB_HR | radix__get_tree_size() |
+			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
+		dw1 = PATB_GR | kvm->arch.process_table;
+	}
+	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
+}
+
+/*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->arch.mmu_setup_lock held.
+ */
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
+{
+	int err = 0;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long hva;
+	struct kvm_memory_slot *memslot;
+	struct vm_area_struct *vma;
+	unsigned long lpcr = 0, senc;
+	unsigned long psize, porder;
+	int srcu_idx;
+
+	/* Allocate hashed page table (if not done already) and reset it */
+	if (!kvm->arch.hpt.virt) {
+		int order = KVM_DEFAULT_HPT_ORDER;
+		struct kvm_hpt_info info;
+
+		err = kvmppc_allocate_hpt(&info, order);
+		/* If we get here, it means userspace didn't specify a
+		 * size explicitly.  So, try successively smaller
+		 * sizes if the default failed. */
+		while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
+			err  = kvmppc_allocate_hpt(&info, order);
+
+		if (err < 0) {
+			pr_err("KVM: Couldn't alloc HPT\n");
+			goto out;
+		}
+
+		kvmppc_set_hpt(kvm, &info);
+	}
+
+	/* Look up the memslot for guest physical address 0 */
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, 0);
+
+	/* We must have some memory at 0 by now */
+	err = -EINVAL;
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto out_srcu;
+
+	/* Look up the VMA for the start of this memory slot */
+	hva = memslot->userspace_addr;
+	mmap_read_lock(kvm->mm);
+	vma = vma_lookup(kvm->mm, hva);
+	if (!vma || (vma->vm_flags & VM_IO))
+		goto up_out;
+
+	psize = vma_kernel_pagesize(vma);
+
+	mmap_read_unlock(kvm->mm);
+
+	/* We can handle 4k, 64k or 16M pages in the VRMA */
+	if (psize >= 0x1000000)
+		psize = 0x1000000;
+	else if (psize >= 0x10000)
+		psize = 0x10000;
+	else
+		psize = 0x1000;
+	porder = __ilog2(psize);
+
+	senc = slb_pgsize_encoding(psize);
+	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+		(VRMA_VSID << SLB_VSID_SHIFT_1T);
+	/* Create HPTEs in the hash page table for the VRMA */
+	kvmppc_map_vrma(vcpu, memslot, porder);
+
+	/* Update VRMASD field in the LPCR */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* the -4 is to account for senc values starting at 0x10 */
+		lpcr = senc << (LPCR_VRMASD_SH - 4);
+		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+	}
+
+	/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
+	smp_wmb();
+	err = 0;
+ out_srcu:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+ out:
+	return err;
+
+ up_out:
+	mmap_read_unlock(kvm->mm);
+	goto out_srcu;
+}
+
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
+int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+{
+	unsigned long lpcr, lpcr_mask;
+
+	if (nesting_enabled(kvm))
+		kvmhv_release_all_nested(kvm);
+	kvmppc_rmap_reset(kvm);
+	kvm->arch.process_table = 0;
+	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.radix = 0;
+	spin_unlock(&kvm->mmu_lock);
+	kvmppc_free_radix(kvm);
+
+	lpcr = LPCR_VPM1;
+	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		lpcr_mask |= LPCR_HAIL;
+	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
+	return 0;
+}
+
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
+int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+{
+	unsigned long lpcr, lpcr_mask;
+	int err;
+
+	err = kvmppc_init_vm_radix(kvm);
+	if (err)
+		return err;
+	kvmppc_rmap_reset(kvm);
+	/* Mutual exclusion with kvm_unmap_gfn_range etc. */
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.radix = 1;
+	spin_unlock(&kvm->mmu_lock);
+	kvmppc_free_hpt(&kvm->arch.hpt);
+
+	lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		lpcr_mask |= LPCR_HAIL;
+		if (cpu_has_feature(CPU_FTR_HVMODE) &&
+				(kvm->arch.host_lpcr & LPCR_HAIL))
+			lpcr |= LPCR_HAIL;
+	}
+	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
+	return 0;
+}
+
+#ifdef CONFIG_KVM_XICS
+/*
+ * Allocate a per-core structure for managing state about which cores are
+ * running in the host versus the guest and for exchanging data between
+ * real mode KVM and CPU running in the host.
+ * This is only done for the first VM.
+ * The allocated structure stays even if all VMs have stopped.
+ * It is only freed when the kvm-hv module is unloaded.
+ * It's OK for this routine to fail, we just don't support host
+ * core operations like redirecting H_IPI wakeups.
+ */
+void kvmppc_alloc_host_rm_ops(void)
+{
+	struct kvmppc_host_rm_ops *ops;
+	unsigned long l_ops;
+	int cpu, core;
+	int size;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return;
+
+	/* Not the first time here ? */
+	if (kvmppc_host_rm_ops_hv != NULL)
+		return;
+
+	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+	if (!ops)
+		return;
+
+	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+	ops->rm_core = kzalloc(size, GFP_KERNEL);
+
+	if (!ops->rm_core) {
+		kfree(ops);
+		return;
+	}
+
+	cpus_read_lock();
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+		if (!cpu_online(cpu))
+			continue;
+
+		core = cpu >> threads_shift;
+		ops->rm_core[core].rm_state.in_host = 1;
+	}
+
+	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+
+	/*
+	 * Make the contents of the kvmppc_host_rm_ops structure visible
+	 * to other CPUs before we assign it to the global variable.
+	 * Do an atomic assignment (no locks used here), but if someone
+	 * beats us to it, just free our copy and return.
+	 */
+	smp_wmb();
+	l_ops = (unsigned long) ops;
+
+	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+		cpus_read_unlock();
+		kfree(ops->rm_core);
+		kfree(ops);
+		return;
+	}
+
+	cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
+					     "ppc/kvm_book3s:prepare",
+					     kvmppc_set_host_core,
+					     kvmppc_clear_host_core);
+	cpus_read_unlock();
+}
+
+void kvmppc_free_host_rm_ops(void)
+{
+	if (kvmppc_host_rm_ops_hv) {
+		cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
+		kfree(kvmppc_host_rm_ops_hv->rm_core);
+		kfree(kvmppc_host_rm_ops_hv);
+		kvmppc_host_rm_ops_hv = NULL;
+	}
+}
+#endif
+
+static int kvmppc_core_init_vm_hv(struct kvm *kvm)
+{
+	unsigned long lpcr, lpid;
+	int ret;
+
+	mutex_init(&kvm->arch.uvmem_lock);
+	INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
+	mutex_init(&kvm->arch.mmu_setup_lock);
+
+	/* Allocate the guest's logical partition ID */
+
+	lpid = kvmppc_alloc_lpid();
+	if ((long)lpid < 0)
+		return -ENOMEM;
+	kvm->arch.lpid = lpid;
+
+	kvmppc_alloc_host_rm_ops();
+
+	kvmhv_vm_nested_init(kvm);
+
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
+	 * does this flush for us.
+	 */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+
+	/* Start out with the default set of hcalls enabled */
+	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
+	       sizeof(kvm->arch.enabled_hcalls));
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+
+	/* Init LPCR for virtual RMA mode */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+		lpcr &= LPCR_PECE | LPCR_LPES;
+	} else {
+		/*
+		 * The L2 LPES mode will be set by the L0 according to whether
+		 * or not it needs to take external interrupts in HV mode.
+		 */
+		lpcr = 0;
+	}
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VPM1;
+	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
+		(VRMA_VSID << SLB_VSID_SHIFT_1T);
+	/* On POWER8 turn on online bit to enable PURR/SPURR */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		lpcr |= LPCR_ONL;
+	/*
+	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
+	 * Set HVICE bit to enable hypervisor virtualization interrupts.
+	 * Set HEIC to prevent OS interrupts to go to hypervisor (should
+	 * be unnecessary but better safe than sorry in case we re-enable
+	 * EE in HV mode with this LPCR still set)
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		lpcr &= ~LPCR_VPM0;
+		lpcr |= LPCR_HVICE | LPCR_HEIC;
+
+		/*
+		 * If xive is enabled, we route 0x500 interrupts directly
+		 * to the guest.
+		 */
+		if (xics_on_xive())
+			lpcr |= LPCR_LPES;
+	}
+
+	/*
+	 * If the host uses radix, the guest starts out as radix.
+	 */
+	if (radix_enabled()) {
+		kvm->arch.radix = 1;
+		kvm->arch.mmu_ready = 1;
+		lpcr &= ~LPCR_VPM1;
+		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+		if (cpu_has_feature(CPU_FTR_HVMODE) &&
+		    cpu_has_feature(CPU_FTR_ARCH_31) &&
+		    (kvm->arch.host_lpcr & LPCR_HAIL))
+			lpcr |= LPCR_HAIL;
+		ret = kvmppc_init_vm_radix(kvm);
+		if (ret) {
+			kvmppc_free_lpid(kvm->arch.lpid);
+			return ret;
+		}
+		kvmppc_setup_partition_table(kvm);
+	}
+
+	verify_lpcr(kvm, lpcr);
+	kvm->arch.lpcr = lpcr;
+
+	/* Initialization for future HPT resizes */
+	kvm->arch.resize_hpt = NULL;
+
+	/*
+	 * Work out how many sets the TLB has, for the use of
+	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		/*
+		 * P10 will flush all the congruence class with a single tlbiel
+		 */
+		kvm->arch.tlb_sets = 1;
+	} else if (radix_enabled())
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
+	else
+		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */
+
+	/*
+	 * Track that we now have a HV mode VM active. This blocks secondary
+	 * CPU threads from coming online.
+	 */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_activated();
+
+	/*
+	 * Initialize smt_mode depending on processor.
+	 * POWER8 and earlier have to use "strict" threading, where
+	 * all vCPUs in a vcore have to run on the same (sub)core,
+	 * whereas on POWER9 the threads can each run a different
+	 * guest.
+	 */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.smt_mode = threads_per_subcore;
+	else
+		kvm->arch.smt_mode = 1;
+	kvm->arch.emul_smt_mode = 1;
+
+	return 0;
+}
+
+static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm)
+{
+	kvmppc_mmu_debugfs_init(kvm);
+	if (radix_enabled())
+		kvmhv_radix_debugfs_init(kvm);
+	return 0;
+}
+
+static void kvmppc_free_vcores(struct kvm *kvm)
+{
+	long int i;
+
+	for (i = 0; i < KVM_MAX_VCORES; ++i)
+		kfree(kvm->arch.vcores[i]);
+	kvm->arch.online_vcores = 0;
+}
+
+static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_deactivated();
+
+	kvmppc_free_vcores(kvm);
+
+
+	if (kvm_is_radix(kvm))
+		kvmppc_free_radix(kvm);
+	else
+		kvmppc_free_hpt(&kvm->arch.hpt);
+
+	/* Perform global invalidation and return lpid to the pool */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (nesting_enabled(kvm))
+			kvmhv_release_all_nested(kvm);
+		kvm->arch.process_table = 0;
+		if (kvm->arch.secure_guest)
+			uv_svm_terminate(kvm->arch.lpid);
+		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
+	}
+
+	kvmppc_free_lpid(kvm->arch.lpid);
+
+	kvmppc_free_pimap(kvm);
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
+				     unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
+					ulong spr_val)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
+					ulong *spr_val)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_check_processor_compat_hv(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_206))
+		return 0;
+
+	/* POWER9 in radix mode is capable of being a nested hypervisor. */
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+		return 0;
+
+	return -EIO;
+}
+
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+	kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_irq_map *irq_map;
+	struct kvmppc_passthru_irqmap *pimap;
+	struct irq_chip *chip;
+	int i, rc = 0;
+	struct irq_data *host_data;
+
+	if (!kvm_irq_bypass)
+		return 1;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+
+	pimap = kvm->arch.pimap;
+	if (pimap == NULL) {
+		/* First call, allocate structure to hold IRQ map */
+		pimap = kvmppc_alloc_pimap();
+		if (pimap == NULL) {
+			mutex_unlock(&kvm->lock);
+			return -ENOMEM;
+		}
+		kvm->arch.pimap = pimap;
+	}
+
+	/*
+	 * For now, we only support interrupts for which the EOI operation
+	 * is an OPAL call followed by a write to XIRR, since that's
+	 * what our real-mode EOI code does, or a XIVE interrupt
+	 */
+	chip = irq_data_get_irq_chip(&desc->irq_data);
+	if (!chip || !is_pnv_opal_msi(chip)) {
+		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+			host_irq, guest_gsi);
+		mutex_unlock(&kvm->lock);
+		return -ENOENT;
+	}
+
+	/*
+	 * See if we already have an entry for this guest IRQ number.
+	 * If it's mapped to a hardware IRQ number, that's an error,
+	 * otherwise re-use this entry.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq) {
+			if (pimap->mapped[i].r_hwirq) {
+				mutex_unlock(&kvm->lock);
+				return -EINVAL;
+			}
+			break;
+		}
+	}
+
+	if (i == KVMPPC_PIRQ_MAPPED) {
+		mutex_unlock(&kvm->lock);
+		return -EAGAIN;		/* table is full */
+	}
+
+	irq_map = &pimap->mapped[i];
+
+	irq_map->v_hwirq = guest_gsi;
+	irq_map->desc = desc;
+
+	/*
+	 * Order the above two stores before the next to serialize with
+	 * the KVM real mode handler.
+	 */
+	smp_wmb();
+
+	/*
+	 * The 'host_irq' number is mapped in the PCI-MSI domain but
+	 * the underlying calls, which will EOI the interrupt in real
+	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
+	 */
+	host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
+	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
+
+	if (i == pimap->n_mapped)
+		pimap->n_mapped++;
+
+	if (xics_on_xive())
+		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
+	else
+		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
+	if (rc)
+		irq_map->r_hwirq = 0;
+
+	mutex_unlock(&kvm->lock);
+
+	return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_passthru_irqmap *pimap;
+	int i, rc = 0;
+
+	if (!kvm_irq_bypass)
+		return 0;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+	if (!kvm->arch.pimap)
+		goto unlock;
+
+	pimap = kvm->arch.pimap;
+
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq)
+			break;
+	}
+
+	if (i == pimap->n_mapped) {
+		mutex_unlock(&kvm->lock);
+		return -ENODEV;
+	}
+
+	if (xics_on_xive())
+		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
+	else
+		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+	/* invalidate the entry (what to do on error from the above ?) */
+	pimap->mapped[i].r_hwirq = 0;
+
+	/*
+	 * We don't free this structure even when the count goes to
+	 * zero. The structure is freed when we destroy the VM.
+	 */
+ unlock:
+	mutex_unlock(&kvm->lock);
+	return rc;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+					     struct irq_bypass_producer *prod)
+{
+	int ret = 0;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = prod;
+
+	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+
+	return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+					      struct irq_bypass_producer *prod)
+{
+	int ret;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = NULL;
+
+	/*
+	 * When producer of consumer is unregistered, we change back to
+	 * default external interrupt handling mode - KVM real mode
+	 * will switch back to host.
+	 */
+	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+}
+#endif
+
+static int kvm_arch_vm_ioctl_hv(struct file *filp,
+				unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm __maybe_unused = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	switch (ioctl) {
+
+	case KVM_PPC_ALLOCATE_HTAB: {
+		u32 htab_order;
+
+		/* If we're a nested hypervisor, we currently only support radix */
+		if (kvmhv_on_pseries()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+
+		r = -EFAULT;
+		if (get_user(htab_order, (u32 __user *)argp))
+			break;
+		r = kvmppc_alloc_reset_hpt(kvm, htab_order);
+		if (r)
+			break;
+		r = 0;
+		break;
+	}
+
+	case KVM_PPC_GET_HTAB_FD: {
+		struct kvm_get_htab_fd ghf;
+
+		r = -EFAULT;
+		if (copy_from_user(&ghf, argp, sizeof(ghf)))
+			break;
+		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+		break;
+	}
+
+	case KVM_PPC_RESIZE_HPT_PREPARE: {
+		struct kvm_ppc_resize_hpt rhpt;
+
+		r = -EFAULT;
+		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+			break;
+
+		r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
+		break;
+	}
+
+	case KVM_PPC_RESIZE_HPT_COMMIT: {
+		struct kvm_ppc_resize_hpt rhpt;
+
+		r = -EFAULT;
+		if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+			break;
+
+		r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
+		break;
+	}
+
+	default:
+		r = -ENOTTY;
+	}
+
+	return r;
+}
+
+/*
+ * List of hcall numbers to enable by default.
+ * For compatibility with old userspace, we enable by default
+ * all hcalls that were implemented before the hcall-enabling
+ * facility was added.  Note this list should not include H_RTAS.
+ */
+static unsigned int default_hcall_list[] = {
+	H_REMOVE,
+	H_ENTER,
+	H_READ,
+	H_PROTECT,
+	H_BULK_REMOVE,
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	H_GET_TCE,
+	H_PUT_TCE,
+#endif
+	H_SET_DABR,
+	H_SET_XDABR,
+	H_CEDE,
+	H_PROD,
+	H_CONFER,
+	H_REGISTER_VPA,
+#ifdef CONFIG_KVM_XICS
+	H_EOI,
+	H_CPPR,
+	H_IPI,
+	H_IPOLL,
+	H_XIRR,
+	H_XIRR_X,
+#endif
+	0
+};
+
+static void init_default_hcalls(void)
+{
+	int i;
+	unsigned int hcall;
+
+	for (i = 0; default_hcall_list[i]; ++i) {
+		hcall = default_hcall_list[i];
+		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
+		__set_bit(hcall / 4, default_enabled_hcalls);
+	}
+}
+
+static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
+{
+	unsigned long lpcr;
+	int radix;
+	int err;
+
+	/* If not on a POWER9, reject it */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+
+	/* If any unknown flags set, reject it */
+	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
+		return -EINVAL;
+
+	/* GR (guest radix) bit in process_table field must match */
+	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
+	if (!!(cfg->process_table & PATB_GR) != radix)
+		return -EINVAL;
+
+	/* Process table size field must be reasonable, i.e. <= 24 */
+	if ((cfg->process_table & PRTS_MASK) > 24)
+		return -EINVAL;
+
+	/* We can change a guest to/from radix now, if the host is radix */
+	if (radix && !radix_enabled())
+		return -EINVAL;
+
+	/* If we're a nested hypervisor, we currently only support radix */
+	if (kvmhv_on_pseries() && !radix)
+		return -EINVAL;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+	if (radix != kvm_is_radix(kvm)) {
+		if (kvm->arch.mmu_ready) {
+			kvm->arch.mmu_ready = 0;
+			/* order mmu_ready vs. vcpus_running */
+			smp_mb();
+			if (atomic_read(&kvm->arch.vcpus_running)) {
+				kvm->arch.mmu_ready = 1;
+				err = -EBUSY;
+				goto out_unlock;
+			}
+		}
+		if (radix)
+			err = kvmppc_switch_mmu_to_radix(kvm);
+		else
+			err = kvmppc_switch_mmu_to_hpt(kvm);
+		if (err)
+			goto out_unlock;
+	}
+
+	kvm->arch.process_table = cfg->process_table;
+	kvmppc_setup_partition_table(kvm);
+
+	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
+	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+	err = 0;
+
+ out_unlock:
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return err;
+}
+
+static int kvmhv_enable_nested(struct kvm *kvm)
+{
+	if (!nested)
+		return -EPERM;
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+	if (!radix_enabled())
+		return -ENODEV;
+
+	/* kvm == NULL means the caller is testing if the capability exists */
+	if (kvm)
+		kvm->arch.nested_enable = true;
+	return 0;
+}
+
+static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+				 int size)
+{
+	int rc = -EINVAL;
+
+	if (kvmhv_vcpu_is_radix(vcpu)) {
+		rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
+
+		if (rc > 0)
+			rc = -EINVAL;
+	}
+
+	/* For now quadrants are the only way to access nested guest memory */
+	if (rc && vcpu->arch.nested)
+		rc = -EAGAIN;
+
+	return rc;
+}
+
+static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+				int size)
+{
+	int rc = -EINVAL;
+
+	if (kvmhv_vcpu_is_radix(vcpu)) {
+		rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
+
+		if (rc > 0)
+			rc = -EINVAL;
+	}
+
+	/* For now quadrants are the only way to access nested guest memory */
+	if (rc && vcpu->arch.nested)
+		rc = -EAGAIN;
+
+	return rc;
+}
+
+static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+	unpin_vpa(kvm, vpa);
+	vpa->gpa = 0;
+	vpa->pinned_addr = NULL;
+	vpa->dirty = false;
+	vpa->update_pending = 0;
+}
+
+/*
+ * Enable a guest to become a secure VM, or test whether
+ * that could be enabled.
+ * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
+ * tested (kvm == NULL) or enabled (kvm != NULL).
+ */
+static int kvmhv_enable_svm(struct kvm *kvm)
+{
+	if (!kvmppc_uvmem_available())
+		return -EINVAL;
+	if (kvm)
+		kvm->arch.svm_enabled = 1;
+	return 0;
+}
+
+/*
+ *  IOCTL handler to turn off secure mode of guest
+ *
+ * - Release all device pages
+ * - Issue ucall to terminate the guest on the UV side
+ * - Unpin the VPA pages.
+ * - Reinit the partition scoped page tables
+ */
+static int kvmhv_svm_off(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int mmu_was_ready;
+	int srcu_idx;
+	int ret = 0;
+	unsigned long i;
+
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return ret;
+
+	mutex_lock(&kvm->arch.mmu_setup_lock);
+	mmu_was_ready = kvm->arch.mmu_ready;
+	if (kvm->arch.mmu_ready) {
+		kvm->arch.mmu_ready = 0;
+		/* order mmu_ready vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.mmu_ready = 1;
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		struct kvm_memory_slot *memslot;
+		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
+		int bkt;
+
+		if (!slots)
+			continue;
+
+		kvm_for_each_memslot(memslot, bkt, slots) {
+			kvmppc_uvmem_drop_pages(memslot, kvm, true);
+			uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
+		}
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	ret = uv_svm_terminate(kvm->arch.lpid);
+	if (ret != U_SUCCESS) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * When secure guest is reset, all the guest pages are sent
+	 * to UV via UV_PAGE_IN before the non-boot vcpus get a
+	 * chance to run and unpin their VPA pages. Unpinning of all
+	 * VPA pages is done here explicitly so that VPA pages
+	 * can be migrated to the secure side.
+	 *
+	 * This is required to for the secure SMP guest to reboot
+	 * correctly.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		unpin_vpa_reset(kvm, &vcpu->arch.dtl);
+		unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
+		unpin_vpa_reset(kvm, &vcpu->arch.vpa);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
+
+	kvmppc_setup_partition_table(kvm);
+	kvm->arch.secure_guest = 0;
+	kvm->arch.mmu_ready = mmu_was_ready;
+out:
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
+	return ret;
+}
+
+static int kvmhv_enable_dawr1(struct kvm *kvm)
+{
+	if (!cpu_has_feature(CPU_FTR_DAWR1))
+		return -ENODEV;
+
+	/* kvm == NULL means the caller is testing if the capability exists */
+	if (kvm)
+		kvm->arch.dawr1_enabled = true;
+	return 0;
+}
+
+static bool kvmppc_hash_v3_possible(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return false;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return false;
+
+	/*
+	 * POWER9 chips before version 2.02 can't have some threads in
+	 * HPT mode and some in radix mode on the same core.
+	 */
+	if (radix_enabled()) {
+		unsigned int pvr = mfspr(SPRN_PVR);
+		if ((pvr >> 16) == PVR_POWER9 &&
+		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
+		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
+			return false;
+	}
+
+	return true;
+}
+
+static struct kvmppc_ops kvm_ops_hv = {
+	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
+	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
+	.get_one_reg = kvmppc_get_one_reg_hv,
+	.set_one_reg = kvmppc_set_one_reg_hv,
+	.vcpu_load   = kvmppc_core_vcpu_load_hv,
+	.vcpu_put    = kvmppc_core_vcpu_put_hv,
+	.inject_interrupt = kvmppc_inject_interrupt_hv,
+	.set_msr     = kvmppc_set_msr_hv,
+	.vcpu_run    = kvmppc_vcpu_run_hv,
+	.vcpu_create = kvmppc_core_vcpu_create_hv,
+	.vcpu_free   = kvmppc_core_vcpu_free_hv,
+	.check_requests = kvmppc_core_check_requests_hv,
+	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
+	.flush_memslot  = kvmppc_core_flush_memslot_hv,
+	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
+	.commit_memory_region  = kvmppc_core_commit_memory_region_hv,
+	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
+	.age_gfn = kvm_age_gfn_hv,
+	.test_age_gfn = kvm_test_age_gfn_hv,
+	.set_spte_gfn = kvm_set_spte_gfn_hv,
+	.free_memslot = kvmppc_core_free_memslot_hv,
+	.init_vm =  kvmppc_core_init_vm_hv,
+	.destroy_vm = kvmppc_core_destroy_vm_hv,
+	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
+	.emulate_op = kvmppc_core_emulate_op_hv,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
+	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
+	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
+	.hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
+	.configure_mmu = kvmhv_configure_mmu,
+	.get_rmmu_info = kvmhv_get_rmmu_info,
+	.set_smt_mode = kvmhv_set_smt_mode,
+	.enable_nested = kvmhv_enable_nested,
+	.load_from_eaddr = kvmhv_load_from_eaddr,
+	.store_to_eaddr = kvmhv_store_to_eaddr,
+	.enable_svm = kvmhv_enable_svm,
+	.svm_off = kvmhv_svm_off,
+	.enable_dawr1 = kvmhv_enable_dawr1,
+	.hash_v3_possible = kvmppc_hash_v3_possible,
+	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
+	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
+};
+
+static int kvm_init_subcore_bitmap(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	struct sibling_subcore_state *sibling_subcore_state;
+
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		/* Ignore if it is already allocated. */
+		if (paca_ptrs[first_cpu]->sibling_subcore_state)
+			continue;
+
+		sibling_subcore_state =
+			kzalloc_node(sizeof(struct sibling_subcore_state),
+							GFP_KERNEL, node);
+		if (!sibling_subcore_state)
+			return -ENOMEM;
+
+
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca_ptrs[cpu]->sibling_subcore_state =
+						sibling_subcore_state;
+		}
+	}
+	return 0;
+}
+
+static int kvmppc_radix_possible(void)
+{
+	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
+}
+
+static int kvmppc_book3s_init_hv(void)
+{
+	int r;
+
+	if (!tlbie_capable) {
+		pr_err("KVM-HV: Host does not support TLBIE\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * FIXME!! Do we need to check on all cpus ?
+	 */
+	r = kvmppc_core_check_processor_compat_hv();
+	if (r < 0)
+		return -ENODEV;
+
+	r = kvmhv_nested_init();
+	if (r)
+		return r;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		r = kvm_init_subcore_bitmap();
+		if (r)
+			goto err;
+	}
+
+	/*
+	 * We need a way of accessing the XICS interrupt controller,
+	 * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
+	 * indirectly, via OPAL.
+	 */
+#ifdef CONFIG_SMP
+	if (!xics_on_xive() && !kvmhv_on_pseries() &&
+	    !local_paca->kvm_hstate.xics_phys) {
+		struct device_node *np;
+
+		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
+		if (!np) {
+			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
+			r = -ENODEV;
+			goto err;
+		}
+		/* presence of intc confirmed - node can be dropped again */
+		of_node_put(np);
+	}
+#endif
+
+	init_default_hcalls();
+
+	init_vcore_lists();
+
+	r = kvmppc_mmu_hv_init();
+	if (r)
+		goto err;
+
+	if (kvmppc_radix_possible()) {
+		r = kvmppc_radix_init();
+		if (r)
+			goto err;
+	}
+
+	r = kvmppc_uvmem_init();
+	if (r < 0) {
+		pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
+		return r;
+	}
+
+	kvm_ops_hv.owner = THIS_MODULE;
+	kvmppc_hv_ops = &kvm_ops_hv;
+
+	return 0;
+
+err:
+	kvmhv_nested_exit();
+	kvmppc_radix_exit();
+
+	return r;
+}
+
+static void kvmppc_book3s_exit_hv(void)
+{
+	kvmppc_uvmem_free();
+	kvmppc_free_host_rm_ops();
+	if (kvmppc_radix_possible())
+		kvmppc_radix_exit();
+	kvmppc_hv_ops = NULL;
+	kvmhv_nested_exit();
+}
+
+module_init(kvmppc_book3s_init_hv);
+module_exit(kvmppc_book3s_exit_hv);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/book3s_hv.h b/arch/powerpc/kvm/book3s_hv.h
new file mode 100644
index 0000000000..95241764df
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.h
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Privileged (non-hypervisor) host registers to save.
+ */
+struct p9_host_os_sprs {
+	unsigned long iamr;
+	unsigned long amr;
+
+	unsigned int pmc1;
+	unsigned int pmc2;
+	unsigned int pmc3;
+	unsigned int pmc4;
+	unsigned int pmc5;
+	unsigned int pmc6;
+	unsigned long mmcr0;
+	unsigned long mmcr1;
+	unsigned long mmcr2;
+	unsigned long mmcr3;
+	unsigned long mmcra;
+	unsigned long siar;
+	unsigned long sier1;
+	unsigned long sier2;
+	unsigned long sier3;
+	unsigned long sdar;
+};
+
+static inline bool nesting_enabled(struct kvm *kvm)
+{
+	return kvm->arch.nested_enable && kvm_is_radix(kvm);
+}
+
+bool load_vcpu_state(struct kvm_vcpu *vcpu,
+			   struct p9_host_os_sprs *host_os_sprs);
+void store_vcpu_state(struct kvm_vcpu *vcpu);
+void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs);
+void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
+				    struct p9_host_os_sprs *host_os_sprs);
+void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
+			    struct p9_host_os_sprs *host_os_sprs);
+void switch_pmu_to_host(struct kvm_vcpu *vcpu,
+			    struct p9_host_os_sprs *host_os_sprs);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
+void accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next);
+#define start_timing(vcpu, next) accumulate_time(vcpu, next)
+#define end_timing(vcpu) accumulate_time(vcpu, NULL)
+#else
+#define accumulate_time(vcpu, next) do {} while (0)
+#define start_timing(vcpu, next) do {} while (0)
+#define end_timing(vcpu) do {} while (0)
+#endif
+
+static inline void __kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 val)
+{
+	vcpu->arch.shregs.msr = val;
+}
+
+static inline u64 __kvmppc_get_msr_hv(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shregs.msr;
+}
+
+#define KVMPPC_BOOK3S_HV_VCPU_ACCESSOR_SET(reg, size)			\
+static inline void kvmppc_set_##reg ##_hv(struct kvm_vcpu *vcpu, u##size val)	\
+{									\
+	vcpu->arch.reg = val;						\
+}
+
+#define KVMPPC_BOOK3S_HV_VCPU_ACCESSOR_GET(reg, size)			\
+static inline u##size kvmppc_get_##reg ##_hv(struct kvm_vcpu *vcpu)	\
+{									\
+	return vcpu->arch.reg;						\
+}
+
+#define KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(reg, size)			\
+	KVMPPC_BOOK3S_HV_VCPU_ACCESSOR_SET(reg, size)			\
+	KVMPPC_BOOK3S_HV_VCPU_ACCESSOR_GET(reg, size)			\
+
+#define KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR_SET(reg, size)		\
+static inline void kvmppc_set_##reg ##_hv(struct kvm_vcpu *vcpu, int i, u##size val)	\
+{									\
+	vcpu->arch.reg[i] = val;					\
+}
+
+#define KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR_GET(reg, size)		\
+static inline u##size kvmppc_get_##reg ##_hv(struct kvm_vcpu *vcpu, int i)	\
+{									\
+	return vcpu->arch.reg[i];					\
+}
+
+#define KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR(reg, size)			\
+	KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR_SET(reg, size)		\
+	KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR_GET(reg, size)		\
+
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(mmcra, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(hfscr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(fscr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(dscr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(purr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(spurr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(amr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(uamor, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(siar, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(sdar, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(iamr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(dawr0, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(dawr1, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(dawrx0, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(dawrx1, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(ciabr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(wort, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(ppr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(ctrl, 64)
+
+KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR(mmcr, 64)
+KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR(sier, 64)
+KVMPPC_BOOK3S_HV_VCPU_ARRAY_ACCESSOR(pmc, 32)
+
+KVMPPC_BOOK3S_HV_VCPU_ACCESSOR(pspb, 32)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 0000000000..663f5222f3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,625 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>
+#include <linux/cma.h>
+#include <linux/bitops.h>
+
+#include <asm/cputable.h>
+#include <asm/interrupt.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
+#include <asm/io.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#define KVM_CMA_CHUNK_ORDER	18
+
+#include "book3s_xics.h"
+#include "book3s_xive.h"
+#include "book3s_hv.h"
+
+/*
+ * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
+ * should be power of 2.
+ */
+#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */
+/*
+ * By default we reserve 5% of memory for hash pagetable allocation.
+ */
+static unsigned long kvm_cma_resv_ratio = 5;
+
+static struct cma *kvm_cma;
+
+static int __init early_parse_kvm_cma_resv(char *p)
+{
+	pr_debug("%s(%s)\n", __func__, p);
+	if (!p)
+		return -EINVAL;
+	return kstrtoul(p, 0, &kvm_cma_resv_ratio);
+}
+early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
+
+struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
+{
+	VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES),
+			 false);
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
+
+void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
+{
+	cma_release(kvm_cma, page, nr_pages);
+}
+EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
+
+/**
+ * kvm_cma_reserve() - reserve area for kvm hash pagetable
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the memblock allocator
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init kvm_cma_reserve(void)
+{
+	unsigned long align_size;
+	phys_addr_t selected_size;
+
+	/*
+	 * We need CMA reservation only when we are in HV mode
+	 */
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+
+	selected_size = PAGE_ALIGN(memblock_phys_mem_size() * kvm_cma_resv_ratio / 100);
+	if (selected_size) {
+		pr_info("%s: reserving %ld MiB for global area\n", __func__,
+			 (unsigned long)selected_size / SZ_1M);
+		align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
+		cma_declare_contiguous(0, selected_size, 0, align_size,
+			KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, "kvm_cma",
+			&kvm_cma);
+	}
+}
+
+/*
+ * Real-mode H_CONFER implementation.
+ * We check if we are the only vcpu out of this virtual core
+ * still running in the guest and not ceded.  If so, we pop up
+ * to the virtual-mode implementation; if not, just return to
+ * the guest.
+ */
+long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
+			    unsigned int yield_count)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	int threads_running;
+	int threads_ceded;
+	int threads_conferring;
+	u64 stop = get_tb() + 10 * tb_ticks_per_usec;
+	int rv = H_SUCCESS; /* => don't yield */
+
+	set_bit(ptid, &vc->conferring_threads);
+	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+		threads_running = VCORE_ENTRY_MAP(vc);
+		threads_ceded = vc->napping_threads;
+		threads_conferring = vc->conferring_threads;
+		if ((threads_ceded | threads_conferring) == threads_running) {
+			rv = H_TOO_HARD; /* => do yield */
+			break;
+		}
+	}
+	clear_bit(ptid, &vc->conferring_threads);
+	return rv;
+}
+
+/*
+ * When running HV mode KVM we need to block certain operations while KVM VMs
+ * exist in the system. We use a counter of VMs to track this.
+ *
+ * One of the operations we need to block is onlining of secondaries, so we
+ * protect hv_vm_count with cpus_read_lock/unlock().
+ */
+static atomic_t hv_vm_count;
+
+void kvm_hv_vm_activated(void)
+{
+	cpus_read_lock();
+	atomic_inc(&hv_vm_count);
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(kvm_hv_vm_activated);
+
+void kvm_hv_vm_deactivated(void)
+{
+	cpus_read_lock();
+	atomic_dec(&hv_vm_count);
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated);
+
+bool kvm_hv_mode_active(void)
+{
+	return atomic_read(&hv_vm_count) != 0;
+}
+
+extern int hcall_real_table[], hcall_real_table_end[];
+
+int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
+{
+	cmd /= 4;
+	if (cmd < hcall_real_table_end - hcall_real_table &&
+	    hcall_real_table[cmd])
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+	return ppc_md.get_random_seed != NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_rm_h_random(struct kvm_vcpu *vcpu)
+{
+	if (ppc_md.get_random_seed &&
+	    ppc_md.get_random_seed(&vcpu->arch.regs.gpr[4]))
+		return H_SUCCESS;
+
+	return H_HARDWARE;
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+	void __iomem *xics_phys;
+	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+
+	/* On POWER9 we can use msgsnd for any destination cpu. */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		msg |= get_hard_smp_processor_id(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd. */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    cpu_first_thread_sibling(cpu) ==
+	    cpu_first_thread_sibling(raw_smp_processor_id())) {
+		msg |= cpu_thread_in_core(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* We should never reach this */
+	if (WARN_ON_ONCE(xics_on_xive()))
+	    return;
+
+	/* Else poke the target with an IPI */
+	xics_phys = paca_ptrs[cpu]->kvm_hstate.xics_phys;
+	if (xics_phys)
+		__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
+	else
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+	int cpu = vc->pcpu;
+
+	/* Order setting of exit map vs. msgsnd/IPI */
+	smp_mb();
+	for (; active; active >>= 1, ++cpu)
+		if (active & 1)
+			kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
+	int me, ee, i;
+
+	/* Set our bit in the threads-exiting-guest map in the 0xff00
+	   bits of vcore->entry_exit_map */
+	me = 0x100 << ptid;
+	do {
+		ee = vc->entry_exit_map;
+	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+	/* Are we the first here? */
+	if ((ee >> 8) != 0)
+		return;
+
+	/*
+	 * Trigger the other threads in this vcore to exit the guest.
+	 * If this is a hypervisor decrementer interrupt then they
+	 * will be already on their way out of the guest.
+	 */
+	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+
+	/*
+	 * If we are doing dynamic micro-threading, interrupt the other
+	 * subcores to pull them out of their guests too.
+	 */
+	if (!sip)
+		return;
+
+	for (i = 0; i < MAX_SUBCORES; ++i) {
+		vc = sip->vc[i];
+		if (!vc)
+			break;
+		do {
+			ee = vc->entry_exit_map;
+			/* Already asked to exit? */
+			if ((ee >> 8) != 0)
+				break;
+		} while (cmpxchg(&vc->entry_exit_map, ee,
+				 ee | VCORE_EXIT_REQ) != ee);
+		if ((ee >> 8) == 0)
+			kvmhv_interrupt_vcore(vc, ee);
+	}
+}
+
+struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+					 u32 xisr)
+{
+	int i;
+
+	/*
+	 * We access the mapped array here without a lock.  That
+	 * is safe because we never reduce the number of entries
+	 * in the array and we never change the v_hwirq field of
+	 * an entry once it is set.
+	 *
+	 * We have also carefully ordered the stores in the writer
+	 * and the loads here in the reader, so that if we find a matching
+	 * hwirq here, the associated GSI and irq_desc fields are valid.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		if (xisr == pimap->mapped[i].r_hwirq) {
+			/*
+			 * Order subsequent reads in the caller to serialize
+			 * with the writer.
+			 */
+			smp_rmb();
+			return &pimap->mapped[i];
+		}
+	}
+	return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
+{
+	struct kvmppc_passthru_irqmap *pimap;
+	struct kvmppc_irq_map *irq_map;
+	struct kvm_vcpu *vcpu;
+
+	vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	if (!vcpu)
+		return 1;
+	pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+	if (!pimap)
+		return 1;
+	irq_map = get_irqmap(pimap, xisr);
+	if (!irq_map)
+		return 1;
+
+	/* We're handling this interrupt, generic code doesn't need to */
+	local_paca->kvm_hstate.saved_xirr = 0;
+
+	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
+{
+	return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *	0 if no interrupt is pending
+ *	1 if an interrupt is pending that needs to be handled by the host
+ *	2 Passthrough that needs completion in the host
+ *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ *	-2 if there is PCI passthrough external interrupt that was handled
+ */
+static long kvmppc_read_one_intr(bool *again);
+
+long kvmppc_read_intr(void)
+{
+	long ret = 0;
+	long rc;
+	bool again;
+
+	if (xive_enabled())
+		return 1;
+
+	do {
+		again = false;
+		rc = kvmppc_read_one_intr(&again);
+		if (rc && (ret == 0 || rc > ret))
+			ret = rc;
+	} while (again);
+	return ret;
+}
+
+static long kvmppc_read_one_intr(bool *again)
+{
+	void __iomem *xics_phys;
+	u32 h_xirr;
+	__be32 xirr;
+	u32 xisr;
+	u8 host_ipi;
+	int64_t rc;
+
+	if (xive_enabled())
+		return 1;
+
+	/* see if a host IPI is pending */
+	host_ipi = READ_ONCE(local_paca->kvm_hstate.host_ipi);
+	if (host_ipi)
+		return 1;
+
+	/* Now read the interrupt from the ICP */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	rc = 0;
+	if (!xics_phys)
+		rc = opal_int_get_xirr(&xirr, false);
+	else
+		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+	if (rc < 0)
+		return 1;
+
+	/*
+	 * Save XIRR for later. Since we get control in reverse endian
+	 * on LE systems, save it byte reversed and fetch it back in
+	 * host endian. Note that xirr is the value read from the
+	 * XIRR register, while h_xirr is the host endian version.
+	 */
+	h_xirr = be32_to_cpu(xirr);
+	local_paca->kvm_hstate.saved_xirr = h_xirr;
+	xisr = h_xirr & 0xffffff;
+	/*
+	 * Ensure that the store/load complete to guarantee all side
+	 * effects of loading from XIRR has completed
+	 */
+	smp_mb();
+
+	/* if nothing pending in the ICP */
+	if (!xisr)
+		return 0;
+
+	/* We found something in the ICP...
+	 *
+	 * If it is an IPI, clear the MFRR and EOI it.
+	 */
+	if (xisr == XICS_IPI) {
+		rc = 0;
+		if (xics_phys) {
+			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
+			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
+		} else {
+			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
+			rc = opal_int_eoi(h_xirr);
+		}
+		/* If rc > 0, there is another interrupt pending */
+		*again = rc > 0;
+
+		/*
+		 * Need to ensure side effects of above stores
+		 * complete before proceeding.
+		 */
+		smp_mb();
+
+		/*
+		 * We need to re-check host IPI now in case it got set in the
+		 * meantime. If it's clear, we bounce the interrupt to the
+		 * guest
+		 */
+		host_ipi = READ_ONCE(local_paca->kvm_hstate.host_ipi);
+		if (unlikely(host_ipi != 0)) {
+			/* We raced with the host,
+			 * we need to resend that IPI, bummer
+			 */
+			if (xics_phys)
+				__raw_rm_writeb(IPI_PRIORITY,
+						xics_phys + XICS_MFRR);
+			else
+				opal_int_set_mfrr(hard_smp_processor_id(),
+						  IPI_PRIORITY);
+			/* Let side effects complete */
+			smp_mb();
+			return 1;
+		}
+
+		/* OK, it's an IPI for us */
+		local_paca->kvm_hstate.saved_xirr = 0;
+		return -1;
+	}
+
+	return kvmppc_check_passthru(xisr, xirr, again);
+}
+
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.ceded = 0;
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
+}
+
+void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
+{
+	/* Guest must always run with ME enabled, HV disabled. */
+	msr = (msr | MSR_ME) & ~MSR_HV;
+
+	/*
+	 * Check for illegal transactional state bit combination
+	 * and if we find it, force the TS field to a safe state.
+	 */
+	if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
+		msr &= ~MSR_TS_MASK;
+	__kvmppc_set_msr_hv(vcpu, msr);
+	kvmppc_end_cede(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvmppc_set_msr_hv);
+
+static void inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+	unsigned long msr, pc, new_msr, new_pc;
+
+	msr = kvmppc_get_msr(vcpu);
+	pc = kvmppc_get_pc(vcpu);
+	new_msr = vcpu->arch.intr_msr;
+	new_pc = vec;
+
+	/* If transactional, change to suspend mode on IRQ delivery */
+	if (MSR_TM_TRANSACTIONAL(msr))
+		new_msr |= MSR_TS_S;
+	else
+		new_msr |= msr & MSR_TS_MASK;
+
+	/*
+	 * Perform MSR and PC adjustment for LPCR[AIL]=3 if it is set and
+	 * applicable. AIL=2 is not supported.
+	 *
+	 * AIL does not apply to SRESET, MCE, or HMI (which is never
+	 * delivered to the guest), and does not apply if IR=0 or DR=0.
+	 */
+	if (vec != BOOK3S_INTERRUPT_SYSTEM_RESET &&
+	    vec != BOOK3S_INTERRUPT_MACHINE_CHECK &&
+	    (vcpu->arch.vcore->lpcr & LPCR_AIL) == LPCR_AIL_3 &&
+	    (msr & (MSR_IR|MSR_DR)) == (MSR_IR|MSR_DR) ) {
+		new_msr |= MSR_IR | MSR_DR;
+		new_pc += 0xC000000000004000ULL;
+	}
+
+	kvmppc_set_srr0(vcpu, pc);
+	kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags);
+	kvmppc_set_pc(vcpu, new_pc);
+	__kvmppc_set_msr_hv(vcpu, new_msr);
+}
+
+void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+	inject_interrupt(vcpu, vec, srr1_flags);
+	kvmppc_end_cede(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvmppc_inject_interrupt_hv);
+
+/*
+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
+ * Can we inject a Decrementer or a External interrupt?
+ */
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
+{
+	int ext;
+	unsigned long lpcr;
+
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
+	/* Insert EXTERNAL bit into LPCR at the MER bit position */
+	ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
+	lpcr = mfspr(SPRN_LPCR);
+	lpcr |= ext << LPCR_MER_SH;
+	mtspr(SPRN_LPCR, lpcr);
+	isync();
+
+	if (vcpu->arch.shregs.msr & MSR_EE) {
+		if (ext) {
+			inject_interrupt(vcpu, BOOK3S_INTERRUPT_EXTERNAL, 0);
+		} else {
+			long int dec = mfspr(SPRN_DEC);
+			if (!(lpcr & LPCR_LD))
+				dec = (int) dec;
+			if (dec < 0)
+				inject_interrupt(vcpu,
+					BOOK3S_INTERRUPT_DECREMENTER, 0);
+		}
+	}
+
+	if (vcpu->arch.doorbell_request) {
+		mtspr(SPRN_DPDES, 1);
+		vcpu->arch.vcore->dpdes = 1;
+		smp_wmb();
+		vcpu->arch.doorbell_request = 0;
+	}
+}
+
+static void flush_guest_tlb(struct kvm *kvm)
+{
+	unsigned long rb, set;
+
+	rb = PPC_BIT(52);	/* IS = 2 */
+	for (set = 0; set < kvm->arch.tlb_sets; ++set) {
+		/* R=0 PRS=0 RIC=0 */
+		asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+			     : : "r" (rb), "i" (0), "i" (0), "i" (0),
+			       "r" (0) : "memory");
+		rb += PPC_BIT(51);	/* increment set number */
+	}
+	asm volatile("ptesync": : :"memory");
+}
+
+void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu)
+{
+	if (cpumask_test_cpu(pcpu, &kvm->arch.need_tlb_flush)) {
+		flush_guest_tlb(kvm);
+
+		/* Clear the bit after the TLB flush */
+		cpumask_clear_cpu(pcpu, &kvm->arch.need_tlb_flush);
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush);
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
new file mode 100644
index 0000000000..1ec50c6967
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+#include <asm/processor.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+	int i;
+
+	/*
+	 * NULL bitmap pointer indicates that KVM module hasn't
+	 * been loaded yet and hence no guests are running, or running
+	 * on POWER9 or newer CPU.
+	 *
+	 * If no KVM is in use, no need to co-ordinate among threads
+	 * as all of them will always be in host and no one is going
+	 * to modify TB other than the opal hmi handler.
+	 *
+	 * POWER9 and newer don't need this synchronisation.
+	 *
+	 * Hence, just return from here.
+	 */
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+		while (local_paca->sibling_subcore_state->in_guest[i])
+			cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		cpu_relax();
+}
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 0000000000..c0deeea7ee
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/linkage.h>
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  none
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) and CR */
+	SAVE_NVGPRS(r1)
+	mfcr	r3
+	std	r3, _CCR(r1)
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, HSTATE_DSCR(r13)
+
+BEGIN_FTR_SECTION
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, HSTATE_DABR(r13)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+	/* Save host PMU registers */
+	bl	kvmhv_save_host_pmu
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 * Because of a hardware deviation in P8,
+	 * we need to set LPCR[HDICE] before writing HDEC.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_KVM(r5)
+	ld	r9, KVM_HOST_LPCR(r6)
+	ori	r8, r9, LPCR_HDICE
+	mtspr	SPRN_LPCR, r8
+	isync
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	extsw	r8,r8
+	mtspr	SPRN_HDEC,r8
+	add	r8,r8,r7
+	std	r8,HSTATE_DECEXP(r13)
+
+	/* Jump to partition switch code */
+	bl	kvmppc_hv_entry_trampoline
+	nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are still hard-disabled.
+ */
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R3       = trap number on this thread
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 */
+
+	/* Restore non-volatile host registers (r14 - r31) and CR */
+	REST_NVGPRS(r1)
+	ld	r4, _CCR(r1)
+	mtcr	r4
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * void kvmhv_save_host_pmu(void)
+ */
+SYM_FUNC_START_LOCAL(kvmhv_save_host_pmu)
+BEGIN_FTR_SECTION
+	/* Work around P8 PMAE bug */
+	li	r3, -1
+	clrrdi	r3, r3, 10
+	mfspr	r8, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r5, 0
+	mtspr	SPRN_MMCRA, r5
+	isync
+	lbz	r5, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r9, SPRN_SIAR
+	mfspr	r10, SPRN_SDAR
+	std	r7, HSTATE_MMCR0(r13)
+	std	r5, HSTATE_MMCR1(r13)
+	std	r6, HSTATE_MMCRA(r13)
+	std	r9, HSTATE_SIAR(r13)
+	std	r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+	mfspr	r9, SPRN_SIER
+	std	r8, HSTATE_MMCR2(r13)
+	std	r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC1(r13)
+	stw	r5, HSTATE_PMC2(r13)
+	stw	r6, HSTATE_PMC3(r13)
+	stw	r7, HSTATE_PMC4(r13)
+	stw	r8, HSTATE_PMC5(r13)
+	stw	r9, HSTATE_PMC6(r13)
+31:	blr
+SYM_FUNC_END(kvmhv_save_host_pmu)
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 0000000000..377d0b4a05
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,1678 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *	   Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+#include <linux/pgtable.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/firmware.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	hr->pcr = vc->pcr | PCR_MASK;
+	hr->dpdes = vc->dpdes;
+	hr->hfscr = vcpu->arch.hfscr;
+	hr->tb_offset = vc->tb_offset;
+	hr->dawr0 = vcpu->arch.dawr0;
+	hr->dawrx0 = vcpu->arch.dawrx0;
+	hr->ciabr = vcpu->arch.ciabr;
+	hr->purr = vcpu->arch.purr;
+	hr->spurr = vcpu->arch.spurr;
+	hr->ic = vcpu->arch.ic;
+	hr->vtb = vc->vtb;
+	hr->srr0 = vcpu->arch.shregs.srr0;
+	hr->srr1 = vcpu->arch.shregs.srr1;
+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
+	hr->pidr = vcpu->arch.pid;
+	hr->cfar = vcpu->arch.cfar;
+	hr->ppr = vcpu->arch.ppr;
+	hr->dawr1 = vcpu->arch.dawr1;
+	hr->dawrx1 = vcpu->arch.dawrx1;
+}
+
+/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */
+static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
+{
+	unsigned long *addr = (unsigned long *) regs;
+
+	for (; addr < ((unsigned long *) (regs + 1)); addr++)
+		*addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+	hr->version = swab64(hr->version);
+	hr->lpid = swab32(hr->lpid);
+	hr->vcpu_token = swab32(hr->vcpu_token);
+	hr->lpcr = swab64(hr->lpcr);
+	hr->pcr = swab64(hr->pcr) | PCR_MASK;
+	hr->amor = swab64(hr->amor);
+	hr->dpdes = swab64(hr->dpdes);
+	hr->hfscr = swab64(hr->hfscr);
+	hr->tb_offset = swab64(hr->tb_offset);
+	hr->dawr0 = swab64(hr->dawr0);
+	hr->dawrx0 = swab64(hr->dawrx0);
+	hr->ciabr = swab64(hr->ciabr);
+	hr->hdec_expiry = swab64(hr->hdec_expiry);
+	hr->purr = swab64(hr->purr);
+	hr->spurr = swab64(hr->spurr);
+	hr->ic = swab64(hr->ic);
+	hr->vtb = swab64(hr->vtb);
+	hr->hdar = swab64(hr->hdar);
+	hr->hdsisr = swab64(hr->hdsisr);
+	hr->heir = swab64(hr->heir);
+	hr->asdr = swab64(hr->asdr);
+	hr->srr0 = swab64(hr->srr0);
+	hr->srr1 = swab64(hr->srr1);
+	hr->sprg[0] = swab64(hr->sprg[0]);
+	hr->sprg[1] = swab64(hr->sprg[1]);
+	hr->sprg[2] = swab64(hr->sprg[2]);
+	hr->sprg[3] = swab64(hr->sprg[3]);
+	hr->pidr = swab64(hr->pidr);
+	hr->cfar = swab64(hr->cfar);
+	hr->ppr = swab64(hr->ppr);
+	hr->dawr1 = swab64(hr->dawr1);
+	hr->dawrx1 = swab64(hr->dawrx1);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu,
+				 struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	hr->dpdes = vc->dpdes;
+	hr->purr = vcpu->arch.purr;
+	hr->spurr = vcpu->arch.spurr;
+	hr->ic = vcpu->arch.ic;
+	hr->vtb = vc->vtb;
+	hr->srr0 = vcpu->arch.shregs.srr0;
+	hr->srr1 = vcpu->arch.shregs.srr1;
+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
+	hr->pidr = vcpu->arch.pid;
+	hr->cfar = vcpu->arch.cfar;
+	hr->ppr = vcpu->arch.ppr;
+	switch (vcpu->arch.trap) {
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		hr->hdar = vcpu->arch.fault_dar;
+		hr->hdsisr = vcpu->arch.fault_dsisr;
+		hr->asdr = vcpu->arch.fault_gpa;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		hr->asdr = vcpu->arch.fault_gpa;
+		break;
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+		hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) |
+			     (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
+		break;
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		hr->heir = vcpu->arch.emul_inst;
+		break;
+	}
+}
+
+static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	vc->pcr = hr->pcr | PCR_MASK;
+	vc->dpdes = hr->dpdes;
+	vcpu->arch.hfscr = hr->hfscr;
+	vcpu->arch.dawr0 = hr->dawr0;
+	vcpu->arch.dawrx0 = hr->dawrx0;
+	vcpu->arch.ciabr = hr->ciabr;
+	vcpu->arch.purr = hr->purr;
+	vcpu->arch.spurr = hr->spurr;
+	vcpu->arch.ic = hr->ic;
+	vc->vtb = hr->vtb;
+	vcpu->arch.shregs.srr0 = hr->srr0;
+	vcpu->arch.shregs.srr1 = hr->srr1;
+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
+	vcpu->arch.pid = hr->pidr;
+	vcpu->arch.cfar = hr->cfar;
+	vcpu->arch.ppr = hr->ppr;
+	vcpu->arch.dawr1 = hr->dawr1;
+	vcpu->arch.dawrx1 = hr->dawrx1;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+				   struct hv_guest_state *hr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	vc->dpdes = hr->dpdes;
+	vcpu->arch.hfscr = hr->hfscr;
+	vcpu->arch.purr = hr->purr;
+	vcpu->arch.spurr = hr->spurr;
+	vcpu->arch.ic = hr->ic;
+	vc->vtb = hr->vtb;
+	vcpu->arch.fault_dar = hr->hdar;
+	vcpu->arch.fault_dsisr = hr->hdsisr;
+	vcpu->arch.fault_gpa = hr->asdr;
+	vcpu->arch.emul_inst = hr->heir;
+	vcpu->arch.shregs.srr0 = hr->srr0;
+	vcpu->arch.shregs.srr1 = hr->srr1;
+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
+	vcpu->arch.pid = hr->pidr;
+	vcpu->arch.cfar = hr->cfar;
+	vcpu->arch.ppr = hr->ppr;
+}
+
+static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
+{
+	/* No need to reflect the page fault to L1, we've handled it */
+	vcpu->arch.trap = 0;
+
+	/*
+	 * Since the L2 gprs have already been written back into L1 memory when
+	 * we complete the mmio, store the L1 memory location of the L2 gpr
+	 * being loaded into by the mmio so that the loaded value can be
+	 * written there in kvmppc_complete_mmio_load()
+	 */
+	if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR)
+	    && (vcpu->mmio_is_write == 0)) {
+		vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr +
+					   offsetof(struct pt_regs,
+						    gpr[vcpu->arch.io_gpr]);
+		vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR;
+	}
+}
+
+static int kvmhv_read_guest_state_and_regs(struct kvm_vcpu *vcpu,
+					   struct hv_guest_state *l2_hv,
+					   struct pt_regs *l2_regs,
+					   u64 hv_ptr, u64 regs_ptr)
+{
+	int size;
+
+	if (kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv->version,
+				sizeof(l2_hv->version)))
+		return -1;
+
+	if (kvmppc_need_byteswap(vcpu))
+		l2_hv->version = swab64(l2_hv->version);
+
+	size = hv_guest_state_size(l2_hv->version);
+	if (size < 0)
+		return -1;
+
+	return kvm_vcpu_read_guest(vcpu, hv_ptr, l2_hv, size) ||
+		kvm_vcpu_read_guest(vcpu, regs_ptr, l2_regs,
+				    sizeof(struct pt_regs));
+}
+
+static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu,
+					    struct hv_guest_state *l2_hv,
+					    struct pt_regs *l2_regs,
+					    u64 hv_ptr, u64 regs_ptr)
+{
+	int size;
+
+	size = hv_guest_state_size(l2_hv->version);
+	if (size < 0)
+		return -1;
+
+	return kvm_vcpu_write_guest(vcpu, hv_ptr, l2_hv, size) ||
+		kvm_vcpu_write_guest(vcpu, regs_ptr, l2_regs,
+				     sizeof(struct pt_regs));
+}
+
+static void load_l2_hv_regs(struct kvm_vcpu *vcpu,
+			    const struct hv_guest_state *l2_hv,
+			    const struct hv_guest_state *l1_hv, u64 *lpcr)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	u64 mask;
+
+	restore_hv_regs(vcpu, l2_hv);
+
+	/*
+	 * Don't let L1 change LPCR bits for the L2 except these:
+	 */
+	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | LPCR_MER;
+
+	/*
+	 * Additional filtering is required depending on hardware
+	 * and configuration.
+	 */
+	*lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm,
+				      (vc->lpcr & ~mask) | (*lpcr & mask));
+
+	/*
+	 * Don't let L1 enable features for L2 which we don't allow for L1,
+	 * but preserve the interrupt cause field.
+	 */
+	vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted);
+
+	/* Don't let data address watchpoint match in hypervisor state */
+	vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP;
+	vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP;
+
+	/* Don't let completed instruction address breakpt match in HV state */
+	if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+		vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV;
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+	long int err, r;
+	struct kvm_nested_guest *l2;
+	struct pt_regs l2_regs, saved_l1_regs;
+	struct hv_guest_state l2_hv = {0}, saved_l1_hv;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	u64 hv_ptr, regs_ptr;
+	u64 hdec_exp, lpcr;
+	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+
+	if (vcpu->kvm->arch.l1_ptcr == 0)
+		return H_NOT_AVAILABLE;
+
+	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+		return H_BAD_MODE;
+
+	/* copy parameters in */
+	hv_ptr = kvmppc_get_gpr(vcpu, 4);
+	regs_ptr = kvmppc_get_gpr(vcpu, 5);
+	kvm_vcpu_srcu_read_lock(vcpu);
+	err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+					      hv_ptr, regs_ptr);
+	kvm_vcpu_srcu_read_unlock(vcpu);
+	if (err)
+		return H_PARAMETER;
+
+	if (kvmppc_need_byteswap(vcpu))
+		byteswap_hv_regs(&l2_hv);
+	if (l2_hv.version > HV_GUEST_STATE_VERSION)
+		return H_P2;
+
+	if (kvmppc_need_byteswap(vcpu))
+		byteswap_pt_regs(&l2_regs);
+	if (l2_hv.vcpu_token >= NR_CPUS)
+		return H_PARAMETER;
+
+	/*
+	 * L1 must have set up a suspended state to enter the L2 in a
+	 * transactional state, and only in that case. These have to be
+	 * filtered out here to prevent causing a TM Bad Thing in the
+	 * host HRFID. We could synthesize a TM Bad Thing back to the L1
+	 * here but there doesn't seem like much point.
+	 */
+	if (MSR_TM_SUSPENDED(vcpu->arch.shregs.msr)) {
+		if (!MSR_TM_ACTIVE(l2_regs.msr))
+			return H_BAD_MODE;
+	} else {
+		if (l2_regs.msr & MSR_TS_MASK)
+			return H_BAD_MODE;
+		if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_TS_MASK))
+			return H_BAD_MODE;
+	}
+
+	/* translate lpid */
+	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+	if (!l2)
+		return H_PARAMETER;
+	if (!l2->l1_gr_to_hr) {
+		mutex_lock(&l2->tlb_lock);
+		kvmhv_update_ptbl_cache(l2);
+		mutex_unlock(&l2->tlb_lock);
+	}
+
+	/* save l1 values of things */
+	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+	saved_l1_regs = vcpu->arch.regs;
+	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+	/* convert TB values/offsets to host (L0) values */
+	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+	vc->tb_offset += l2_hv.tb_offset;
+	vcpu->arch.dec_expires += l2_hv.tb_offset;
+
+	/* set L1 state to L2 state */
+	vcpu->arch.nested = l2;
+	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+	vcpu->arch.nested_hfscr = l2_hv.hfscr;
+	vcpu->arch.regs = l2_regs;
+
+	/* Guest must always run with ME enabled, HV disabled. */
+	vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV;
+
+	lpcr = l2_hv.lpcr;
+	load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr);
+
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+	do {
+		r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
+	} while (is_kvmppc_resume_guest(r));
+
+	/* save L2 state for return */
+	l2_regs = vcpu->arch.regs;
+	l2_regs.msr = vcpu->arch.shregs.msr;
+	delta_purr = vcpu->arch.purr - l2_hv.purr;
+	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+	delta_ic = vcpu->arch.ic - l2_hv.ic;
+	delta_vtb = vc->vtb - l2_hv.vtb;
+	save_hv_return_state(vcpu, &l2_hv);
+
+	/* restore L1 state */
+	vcpu->arch.nested = NULL;
+	vcpu->arch.regs = saved_l1_regs;
+	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+	/* set L1 MSR TS field according to L2 transaction state */
+	if (l2_regs.msr & MSR_TS_MASK)
+		vcpu->arch.shregs.msr |= MSR_TS_S;
+	vc->tb_offset = saved_l1_hv.tb_offset;
+	/* XXX: is this always the same delta as saved_l1_hv.tb_offset? */
+	vcpu->arch.dec_expires -= l2_hv.tb_offset;
+	restore_hv_regs(vcpu, &saved_l1_hv);
+	vcpu->arch.purr += delta_purr;
+	vcpu->arch.spurr += delta_spurr;
+	vcpu->arch.ic += delta_ic;
+	vc->vtb += delta_vtb;
+
+	kvmhv_put_nested(l2);
+
+	/* copy l2_hv_state and regs back to guest */
+	if (kvmppc_need_byteswap(vcpu)) {
+		byteswap_hv_regs(&l2_hv);
+		byteswap_pt_regs(&l2_regs);
+	}
+	kvm_vcpu_srcu_read_lock(vcpu);
+	err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+					       hv_ptr, regs_ptr);
+	kvm_vcpu_srcu_read_unlock(vcpu);
+	if (err)
+		return H_AUTHORITY;
+
+	if (r == -EINTR)
+		return H_INTERRUPT;
+
+	if (vcpu->mmio_needed) {
+		kvmhv_nested_mmio_needed(vcpu, regs_ptr);
+		return H_TOO_HARD;
+	}
+
+	return vcpu->arch.trap;
+}
+
+long kvmhv_nested_init(void)
+{
+	long int ptb_order;
+	unsigned long ptcr;
+	long rc;
+
+	if (!kvmhv_on_pseries())
+		return 0;
+	if (!radix_enabled())
+		return -ENODEV;
+
+	/* Partition table entry is 1<<4 bytes in size, hence the 4. */
+	ptb_order = KVM_MAX_NESTED_GUESTS_SHIFT + 4;
+	/* Minimum partition table size is 1<<12 bytes */
+	if (ptb_order < 12)
+		ptb_order = 12;
+	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+				       GFP_KERNEL);
+	if (!pseries_partition_tb) {
+		pr_err("kvm-hv: failed to allocated nested partition table\n");
+		return -ENOMEM;
+	}
+
+	ptcr = __pa(pseries_partition_tb) | (ptb_order - 12);
+	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+	if (rc != H_SUCCESS) {
+		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+		       rc);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+void kvmhv_nested_exit(void)
+{
+	/*
+	 * N.B. the kvmhv_on_pseries() test is there because it enables
+	 * the compiler to remove the call to plpar_hcall_norets()
+	 * when CONFIG_PPC_PSERIES=n.
+	 */
+	if (kvmhv_on_pseries() && pseries_partition_tb) {
+		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+		kfree(pseries_partition_tb);
+		pseries_partition_tb = NULL;
+	}
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+	long rc;
+
+	if (!kvmhv_on_pseries()) {
+		radix__flush_all_lpid(lpid);
+		return;
+	}
+
+	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+					lpid, TLBIEL_INVAL_SET_LPID);
+	else
+		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+					    H_RPTI_TYPE_NESTED |
+					    H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+					    H_RPTI_TYPE_PAT,
+					    H_RPTI_PAGE_ALL, 0, -1UL);
+	if (rc)
+		pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+	if (!kvmhv_on_pseries()) {
+		mmu_partition_table_set_entry(lpid, dw0, dw1, true);
+		return;
+	}
+
+	pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+	pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+	/* L0 will do the necessary barriers */
+	kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+	unsigned long dw0;
+
+	dw0 = PATB_HR | radix__get_tree_size() |
+		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	/* Check partition size and base address. */
+	if ((ptcr & PRTS_MASK) + 12 - 4 > KVM_MAX_NESTED_GUESTS_SHIFT ||
+	    !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+		ret = H_PARAMETER;
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	if (ret == H_SUCCESS)
+		kvm->arch.l1_ptcr = ptcr;
+
+	return ret;
+}
+
+/*
+ * Handle the H_COPY_TOFROM_GUEST hcall.
+ * r4 = L1 lpid of nested guest
+ * r5 = pid
+ * r6 = eaddr to access
+ * r7 = to buffer (L1 gpa)
+ * r8 = from buffer (L1 gpa)
+ * r9 = n bytes to copy
+ */
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
+{
+	struct kvm_nested_guest *gp;
+	int l1_lpid = kvmppc_get_gpr(vcpu, 4);
+	int pid = kvmppc_get_gpr(vcpu, 5);
+	gva_t eaddr = kvmppc_get_gpr(vcpu, 6);
+	gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7);
+	gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8);
+	void *buf;
+	unsigned long n = kvmppc_get_gpr(vcpu, 9);
+	bool is_load = !!gp_to;
+	long rc;
+
+	if (gp_to && gp_from) /* One must be NULL to determine the direction */
+		return H_PARAMETER;
+
+	if (eaddr & (0xFFFUL << 52))
+		return H_PARAMETER;
+
+	buf = kzalloc(n, GFP_KERNEL | __GFP_NOWARN);
+	if (!buf)
+		return H_NO_MEM;
+
+	gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
+	if (!gp) {
+		rc = H_PARAMETER;
+		goto out_free;
+	}
+
+	mutex_lock(&gp->tlb_lock);
+
+	if (is_load) {
+		/* Load from the nested guest into our buffer */
+		rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
+						     eaddr, buf, NULL, n);
+		if (rc)
+			goto not_found;
+
+		/* Write what was loaded into our buffer back to the L1 guest */
+		kvm_vcpu_srcu_read_lock(vcpu);
+		rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
+		kvm_vcpu_srcu_read_unlock(vcpu);
+		if (rc)
+			goto not_found;
+	} else {
+		/* Load the data to be stored from the L1 guest into our buf */
+		kvm_vcpu_srcu_read_lock(vcpu);
+		rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
+		kvm_vcpu_srcu_read_unlock(vcpu);
+		if (rc)
+			goto not_found;
+
+		/* Store from our buffer into the nested guest */
+		rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
+						     eaddr, NULL, buf, n);
+		if (rc)
+			goto not_found;
+	}
+
+out_unlock:
+	mutex_unlock(&gp->tlb_lock);
+	kvmhv_put_nested(gp);
+out_free:
+	kfree(buf);
+	return rc;
+not_found:
+	rc = H_NOT_FOUND;
+	goto out_unlock;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+	int ret;
+	struct patb_entry ptbl_entry;
+	unsigned long ptbl_addr;
+	struct kvm *kvm = gp->l1_host;
+
+	ret = -EFAULT;
+	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+	if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) {
+		int srcu_idx = srcu_read_lock(&kvm->srcu);
+		ret = kvm_read_guest(kvm, ptbl_addr,
+				     &ptbl_entry, sizeof(ptbl_entry));
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+	}
+	if (ret) {
+		gp->l1_gr_to_hr = 0;
+		gp->process_table = 0;
+	} else {
+		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+	}
+	kvmhv_set_nested_ptbl(gp);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+	idr_init(&kvm->arch.kvm_nested_guest_idr);
+}
+
+static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid)
+{
+	return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid);
+}
+
+static bool __prealloc_nested(struct kvm *kvm, int lpid)
+{
+	if (idr_alloc(&kvm->arch.kvm_nested_guest_idr,
+				NULL, lpid, lpid + 1, GFP_KERNEL) != lpid)
+		return false;
+	return true;
+}
+
+static void __add_nested(struct kvm *kvm, int lpid, struct kvm_nested_guest *gp)
+{
+	if (idr_replace(&kvm->arch.kvm_nested_guest_idr, gp, lpid))
+		WARN_ON(1);
+}
+
+static void __remove_nested(struct kvm *kvm, int lpid)
+{
+	idr_remove(&kvm->arch.kvm_nested_guest_idr, lpid);
+}
+
+static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+	struct kvm_nested_guest *gp;
+	long shadow_lpid;
+
+	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+	if (!gp)
+		return NULL;
+	gp->l1_host = kvm;
+	gp->l1_lpid = lpid;
+	mutex_init(&gp->tlb_lock);
+	gp->shadow_pgtable = pgd_alloc(kvm->mm);
+	if (!gp->shadow_pgtable)
+		goto out_free;
+	shadow_lpid = kvmppc_alloc_lpid();
+	if (shadow_lpid < 0)
+		goto out_free2;
+	gp->shadow_lpid = shadow_lpid;
+	gp->radix = 1;
+
+	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+	return gp;
+
+ out_free2:
+	pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+	kfree(gp);
+	return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+
+	if (gp->shadow_pgtable) {
+		/*
+		 * No vcpu is using this struct and no call to
+		 * kvmhv_get_nested can find this struct,
+		 * so we don't need to hold kvm->mmu_lock.
+		 */
+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+					  gp->shadow_lpid);
+		pgd_free(kvm->mm, gp->shadow_pgtable);
+	}
+	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+	kvmppc_free_lpid(gp->shadow_lpid);
+	kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	int lpid = gp->l1_lpid;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	if (gp == __find_nested(kvm, lpid)) {
+		__remove_nested(kvm, lpid);
+		--gp->refcnt;
+	}
+	ref = gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+	int lpid;
+	struct kvm_nested_guest *gp;
+	struct kvm_nested_guest *freelist = NULL;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx, bkt;
+
+	spin_lock(&kvm->mmu_lock);
+	idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
+		__remove_nested(kvm, lpid);
+		if (--gp->refcnt == 0) {
+			gp->next = freelist;
+			freelist = gp;
+		}
+	}
+	idr_destroy(&kvm->arch.kvm_nested_guest_idr);
+	/* idr is empty and may be reused at this point */
+	spin_unlock(&kvm->mmu_lock);
+	while ((gp = freelist) != NULL) {
+		freelist = gp->next;
+		kvmhv_release_nested(gp);
+	}
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	kvm_for_each_memslot(memslot, bkt, kvm_memslots(kvm))
+		kvmhv_free_memslot_nest_rmap(memslot);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+
+	spin_lock(&kvm->mmu_lock);
+	kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+	spin_unlock(&kvm->mmu_lock);
+	kvmhv_flush_lpid(gp->shadow_lpid);
+	kvmhv_update_ptbl_cache(gp);
+	if (gp->l1_gr_to_hr == 0)
+		kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+					  bool create)
+{
+	struct kvm_nested_guest *gp, *newgp;
+
+	if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+		return NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	gp = __find_nested(kvm, l1_lpid);
+	if (gp)
+		++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (gp || !create)
+		return gp;
+
+	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+	if (!newgp)
+		return NULL;
+
+	if (!__prealloc_nested(kvm, l1_lpid)) {
+		kvmhv_release_nested(newgp);
+		return NULL;
+	}
+
+	spin_lock(&kvm->mmu_lock);
+	gp = __find_nested(kvm, l1_lpid);
+	if (!gp) {
+		__add_nested(kvm, l1_lpid, newgp);
+		++newgp->refcnt;
+		gp = newgp;
+		newgp = NULL;
+	}
+	++gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (newgp)
+		kvmhv_release_nested(newgp);
+
+	return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = gp->l1_host;
+	long ref;
+
+	spin_lock(&kvm->mmu_lock);
+	ref = --gp->refcnt;
+	spin_unlock(&kvm->mmu_lock);
+	if (ref == 0)
+		kvmhv_release_nested(gp);
+}
+
+pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+				 unsigned long ea, unsigned *hshift)
+{
+	struct kvm_nested_guest *gp;
+	pte_t *pte;
+
+	gp = __find_nested(kvm, lpid);
+	if (!gp)
+		return NULL;
+
+	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+		"%s called with kvm mmu_lock not held \n", __func__);
+	pte = __find_linux_pte(gp->shadow_pgtable, ea, NULL, hshift);
+
+	return pte;
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+	return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+				       RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+			    struct rmap_nested **n_rmap)
+{
+	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+	struct rmap_nested *cursor;
+	u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+	/* Are there any existing entries? */
+	if (!(*rmapp)) {
+		/* No -> use the rmap as a single entry */
+		*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+		return;
+	}
+
+	/* Do any entries match what we're trying to insert? */
+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
+		if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+			return;
+	}
+
+	/* Do we need to create a list or just add the new entry? */
+	rmap = *rmapp;
+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+		*rmapp = 0UL;
+	llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+		(*n_rmap)->list.next = (struct llist_node *) rmap;
+
+	/* Set NULL so not freed by caller */
+	*n_rmap = NULL;
+}
+
+static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap,
+				      unsigned long clr, unsigned long set,
+				      unsigned long hpa, unsigned long mask)
+{
+	unsigned long gpa;
+	unsigned int shift, lpid;
+	pte_t *ptep;
+
+	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+
+	/* Find the pte */
+	ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+	/*
+	 * If the pte is present and the pfn is still the same, update the pte.
+	 * If the pfn has changed then this is a stale rmap entry, the nested
+	 * gpa actually points somewhere else now, and there is nothing to do.
+	 * XXX A future optimisation would be to remove the rmap entry here.
+	 */
+	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) {
+		__radix_pte_update(ptep, clr, set);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+	}
+}
+
+/*
+ * For a given list of rmap entries, update the rc bits in all ptes in shadow
+ * page tables for nested guests which are referenced by the rmap list.
+ */
+void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp,
+				    unsigned long clr, unsigned long set,
+				    unsigned long hpa, unsigned long nbytes)
+{
+	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+	struct rmap_nested *cursor;
+	unsigned long rmap, mask;
+
+	if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED))
+		return;
+
+	mask = PTE_RPN_MASK & ~(nbytes - 1);
+	hpa &= mask;
+
+	for_each_nest_rmap_safe(cursor, entry, &rmap)
+		kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask);
+}
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+				   unsigned long hpa, unsigned long mask)
+{
+	struct kvm_nested_guest *gp;
+	unsigned long gpa;
+	unsigned int shift, lpid;
+	pte_t *ptep;
+
+	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+	gp = __find_nested(kvm, lpid);
+	if (!gp)
+		return;
+
+	/* Find and invalidate the pte */
+	ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+	/* Don't spuriously invalidate ptes if the pfn has changed */
+	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+					unsigned long hpa, unsigned long mask)
+{
+	struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+	struct rmap_nested *cursor;
+	unsigned long rmap;
+
+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
+		kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+		kfree(cursor);
+	}
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+				  const struct kvm_memory_slot *memslot,
+				  unsigned long gpa, unsigned long hpa,
+				  unsigned long nbytes)
+{
+	unsigned long gfn, end_gfn;
+	unsigned long addr_mask;
+
+	if (!memslot)
+		return;
+	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+	hpa &= addr_mask;
+
+	for (; gfn < end_gfn; gfn++) {
+		unsigned long *rmap = &memslot->arch.rmap[gfn];
+		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+	}
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+	unsigned long page;
+
+	for (page = 0; page < free->npages; page++) {
+		unsigned long rmap, *rmapp = &free->arch.rmap[page];
+		struct rmap_nested *cursor;
+		struct llist_node *entry;
+
+		entry = llist_del_all((struct llist_head *) rmapp);
+		for_each_nest_rmap_safe(cursor, entry, &rmap)
+			kfree(cursor);
+	}
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+					struct kvm_nested_guest *gp,
+					long gpa, int *shift_ret)
+{
+	struct kvm *kvm = vcpu->kvm;
+	bool ret = false;
+	pte_t *ptep;
+	int shift;
+
+	spin_lock(&kvm->mmu_lock);
+	ptep = find_kvm_nested_guest_pte(kvm, gp->l1_lpid, gpa, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
+	if (ptep && pte_present(*ptep)) {
+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+		ret = true;
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (shift_ret)
+		*shift_ret = shift;
+	return ret;
+}
+
+static inline int get_ric(unsigned int instr)
+{
+	return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+	return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+	return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+	return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+	return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+	return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+	return r_val >> 12;
+}
+
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+					int ap, long epn)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	long npages;
+	int shift, shadow_shift;
+	unsigned long addr;
+
+	shift = ap_to_shift(ap);
+	addr = epn << 12;
+	if (shift < 0)
+		/* Invalid ap encoding */
+		return -EINVAL;
+
+	addr &= ~((1UL << shift) - 1);
+	npages = 1UL << (shift - PAGE_SHIFT);
+
+	gp = kvmhv_get_nested(kvm, lpid, false);
+	if (!gp) /* No such guest -> nothing to do */
+		return 0;
+	mutex_lock(&gp->tlb_lock);
+
+	/* There may be more than one host page backing this single guest pte */
+	do {
+		kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+		npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+		addr += 1UL << shadow_shift;
+	} while (npages > 0);
+
+	mutex_unlock(&gp->tlb_lock);
+	kvmhv_put_nested(gp);
+	return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_guest *gp, int ric)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	mutex_lock(&gp->tlb_lock);
+	switch (ric) {
+	case 0:
+		/* Invalidate TLB */
+		spin_lock(&kvm->mmu_lock);
+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+					  gp->shadow_lpid);
+		kvmhv_flush_lpid(gp->shadow_lpid);
+		spin_unlock(&kvm->mmu_lock);
+		break;
+	case 1:
+		/*
+		 * Invalidate PWC
+		 * We don't cache this -> nothing to do
+		 */
+		break;
+	case 2:
+		/* Invalidate TLB, PWC and caching of partition table entries */
+		kvmhv_flush_nested(gp);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	int lpid;
+
+	spin_lock(&kvm->mmu_lock);
+	idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
+		spin_unlock(&kvm->mmu_lock);
+		kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+		spin_lock(&kvm->mmu_lock);
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+				    unsigned long rsval, unsigned long rbval)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+	int r, ric, prs, is, ap;
+	int lpid;
+	long epn;
+	int ret = 0;
+
+	ric = get_ric(instr);
+	prs = get_prs(instr);
+	r = get_r(instr);
+	lpid = get_lpid(rsval);
+	is = get_is(rbval);
+
+	/*
+	 * These cases are invalid and are not handled:
+	 * r   != 1 -> Only radix supported
+	 * prs == 1 -> Not HV privileged
+	 * ric == 3 -> No cluster bombs for radix
+	 * is  == 1 -> Partition scoped translations not associated with pid
+	 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+	 */
+	if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+	    ((!is) && (ric == 1 || ric == 2)))
+		return -EINVAL;
+
+	switch (is) {
+	case 0:
+		/*
+		 * We know ric == 0
+		 * Invalidate TLB for a given target address
+		 */
+		epn = get_epn(rbval);
+		ap = get_ap(rbval);
+		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+		break;
+	case 2:
+		/* Invalidate matching LPID */
+		gp = kvmhv_get_nested(kvm, lpid, false);
+		if (gp) {
+			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+			kvmhv_put_nested(gp);
+		}
+		break;
+	case 3:
+		/* Invalidate ALL LPIDs */
+		kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+			kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+	if (ret)
+		return H_PARAMETER;
+	return H_SUCCESS;
+}
+
+static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
+					 unsigned long lpid, unsigned long ric)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *gp;
+
+	gp = kvmhv_get_nested(kvm, lpid, false);
+	if (gp) {
+		kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+		kvmhv_put_nested(gp);
+	}
+	return H_SUCCESS;
+}
+
+/*
+ * Number of pages above which we invalidate the entire LPID rather than
+ * flush individual pages.
+ */
+static unsigned long tlb_range_flush_page_ceiling __read_mostly = 33;
+
+static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
+					 unsigned long lpid,
+					 unsigned long pg_sizes,
+					 unsigned long start,
+					 unsigned long end)
+{
+	int ret = H_P4;
+	unsigned long addr, nr_pages;
+	struct mmu_psize_def *def;
+	unsigned long psize, ap, page_size;
+	bool flush_lpid;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+		def = &mmu_psize_defs[psize];
+		if (!(pg_sizes & def->h_rpt_pgsize))
+			continue;
+
+		nr_pages = (end - start) >> def->shift;
+		flush_lpid = nr_pages > tlb_range_flush_page_ceiling;
+		if (flush_lpid)
+			return do_tlb_invalidate_nested_all(vcpu, lpid,
+							RIC_FLUSH_TLB);
+		addr = start;
+		ap = mmu_get_ap(psize);
+		page_size = 1UL << def->shift;
+		do {
+			ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
+						   get_epn(addr));
+			if (ret)
+				return H_P4;
+			addr += page_size;
+		} while (addr < end);
+	}
+	return ret;
+}
+
+/*
+ * Performs partition-scoped invalidations for nested guests
+ * as part of H_RPT_INVALIDATE hcall.
+ */
+long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
+			     unsigned long type, unsigned long pg_sizes,
+			     unsigned long start, unsigned long end)
+{
+	/*
+	 * If L2 lpid isn't valid, we need to return H_PARAMETER.
+	 *
+	 * However, nested KVM issues a L2 lpid flush call when creating
+	 * partition table entries for L2. This happens even before the
+	 * corresponding shadow lpid is created in HV which happens in
+	 * H_ENTER_NESTED call. Since we can't differentiate this case from
+	 * the invalid case, we ignore such flush requests and return success.
+	 */
+	if (!__find_nested(vcpu->kvm, lpid))
+		return H_SUCCESS;
+
+	/*
+	 * A flush all request can be handled by a full lpid flush only.
+	 */
+	if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
+		return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_ALL);
+
+	/*
+	 * We don't need to handle a PWC flush like process table here,
+	 * because intermediate partition scoped table in nested guest doesn't
+	 * really have PWC. Only level we have PWC is in L0 and for nested
+	 * invalidate at L0 we always do kvm_flush_lpid() which does
+	 * radix__flush_all_lpid(). For range invalidate at any level, we
+	 * are not removing the higher level page tables and hence there is
+	 * no PWC invalidate needed.
+	 *
+	 * if (type & H_RPTI_TYPE_PWC) {
+	 *	ret = do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_PWC);
+	 *	if (ret)
+	 *		return H_P4;
+	 * }
+	 */
+
+	if (start == 0 && end == -1)
+		return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_TLB);
+
+	if (type & H_RPTI_TYPE_TLB)
+		return do_tlb_invalidate_nested_tlb(vcpu, lpid, pg_sizes,
+						    start, end);
+	return H_SUCCESS;
+}
+
+/* Used to convert a nested guest real address to a L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+				       struct kvm_nested_guest *gp,
+				       unsigned long n_gpa, unsigned long dsisr,
+				       struct kvmppc_pte *gpte_p)
+{
+	u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+	int ret;
+
+	ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+					 &fault_addr);
+
+	if (ret) {
+		/* We didn't find a pte */
+		if (ret == -EINVAL) {
+			/* Unsupported mmu config */
+			flags |= DSISR_UNSUPP_MMU;
+		} else if (ret == -ENOENT) {
+			/* No translation found */
+			flags |= DSISR_NOHPTE;
+		} else if (ret == -EFAULT) {
+			/* Couldn't access L1 real address */
+			flags |= DSISR_PRTABLE_FAULT;
+			vcpu->arch.fault_gpa = fault_addr;
+		} else {
+			/* Unknown error */
+			return ret;
+		}
+		goto forward_to_l1;
+	} else {
+		/* We found a pte -> check permissions */
+		if (dsisr & DSISR_ISSTORE) {
+			/* Can we write? */
+			if (!gpte_p->may_write) {
+				flags |= DSISR_PROTFAULT;
+				goto forward_to_l1;
+			}
+		} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+			/* Can we execute? */
+			if (!gpte_p->may_execute) {
+				flags |= SRR1_ISI_N_G_OR_CIP;
+				goto forward_to_l1;
+			}
+		} else {
+			/* Can we read? */
+			if (!gpte_p->may_read && !gpte_p->may_write) {
+				flags |= DSISR_PROTFAULT;
+				goto forward_to_l1;
+			}
+		}
+	}
+
+	return 0;
+
+forward_to_l1:
+	vcpu->arch.fault_dsisr = flags;
+	if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+		vcpu->arch.shregs.msr &= SRR1_MSR_BITS;
+		vcpu->arch.shregs.msr |= flags;
+	}
+	return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+				       struct kvm_nested_guest *gp,
+				       unsigned long n_gpa,
+				       struct kvmppc_pte gpte,
+				       unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	u64 pgflags;
+	long ret;
+
+	/* Are the rc bits set in the L1 partition scoped pte? */
+	pgflags = _PAGE_ACCESSED;
+	if (writing)
+		pgflags |= _PAGE_DIRTY;
+	if (pgflags & ~gpte.rc)
+		return RESUME_HOST;
+
+	spin_lock(&kvm->mmu_lock);
+	/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+	ret = kvmppc_hv_handle_set_rc(kvm, false, writing,
+				      gpte.raddr, kvm->arch.lpid);
+	if (!ret) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
+	ret = kvmppc_hv_handle_set_rc(kvm, true, writing,
+				      n_gpa, gp->l1_lpid);
+	if (!ret)
+		ret = -EINVAL;
+	else
+		ret = 0;
+
+out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	return ret;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+	switch (level) {
+	case 2:
+		return PUD_SHIFT;
+	case 1:
+		return PMD_SHIFT;
+	default:
+		return PAGE_SHIFT;
+	}
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+	if (shift == PUD_SHIFT)
+		return 2;
+	if (shift == PMD_SHIFT)
+		return 1;
+	if (shift == PAGE_SHIFT)
+		return 0;
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
+					  struct kvm_nested_guest *gp)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memory_slot *memslot;
+	struct rmap_nested *n_rmap;
+	struct kvmppc_pte gpte;
+	pte_t pte, *pte_p;
+	unsigned long mmu_seq;
+	unsigned long dsisr = vcpu->arch.fault_dsisr;
+	unsigned long ea = vcpu->arch.fault_dar;
+	unsigned long *rmapp;
+	unsigned long n_gpa, gpa, gfn, perm = 0UL;
+	unsigned int shift, l1_shift, level;
+	bool writing = !!(dsisr & DSISR_ISSTORE);
+	bool kvm_ro = false;
+	long int ret;
+
+	if (!gp->l1_gr_to_hr) {
+		kvmhv_update_ptbl_cache(gp);
+		if (!gp->l1_gr_to_hr)
+			return RESUME_HOST;
+	}
+
+	/* Convert the nested guest real address into a L1 guest real address */
+
+	n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
+	if (!(dsisr & DSISR_PRTABLE_FAULT))
+		n_gpa |= ea & 0xFFF;
+	ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
+
+	/*
+	 * If the hardware found a translation but we don't now have a usable
+	 * translation in the l1 partition-scoped tree, remove the shadow pte
+	 * and let the guest retry.
+	 */
+	if (ret == RESUME_HOST &&
+	    (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
+		      DSISR_BAD_COPYPASTE)))
+		goto inval;
+	if (ret)
+		return ret;
+
+	/* Failed to set the reference/change bits */
+	if (dsisr & DSISR_SET_RC) {
+		ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
+		if (ret == RESUME_HOST)
+			return ret;
+		if (ret)
+			goto inval;
+		dsisr &= ~DSISR_SET_RC;
+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+			       DSISR_PROTFAULT)))
+			return RESUME_GUEST;
+	}
+
+	/*
+	 * We took an HISI or HDSI while we were running a nested guest which
+	 * means we have no partition scoped translation for that. This means
+	 * we need to insert a pte for the mapping into our shadow_pgtable.
+	 */
+
+	l1_shift = gpte.page_shift;
+	if (l1_shift < PAGE_SHIFT) {
+		/* We don't support l1 using a page size smaller than our own */
+		pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
+			l1_shift, PAGE_SHIFT);
+		return -EINVAL;
+	}
+	gpa = gpte.raddr;
+	gfn = gpa >> PAGE_SHIFT;
+
+	/* 1. Get the corresponding host memslot */
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
+			/* unusual error -> reflect to the guest as a DSI */
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, dsisr);
+			return RESUME_GUEST;
+		}
+
+		/* passthrough of emulated MMIO case */
+		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
+	}
+	if (memslot->flags & KVM_MEM_READONLY) {
+		if (writing) {
+			/* Give the guest a DSI */
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
+			return RESUME_GUEST;
+		}
+		kvm_ro = true;
+	}
+
+	/* 2. Find the host pte for this L1 guest real address */
+
+	/* Used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	/* See if can find translation in our partition scoped tables for L1 */
+	pte = __pte(0);
+	spin_lock(&kvm->mmu_lock);
+	pte_p = find_kvm_secondary_pte(kvm, gpa, &shift);
+	if (!shift)
+		shift = PAGE_SHIFT;
+	if (pte_p)
+		pte = *pte_p;
+	spin_unlock(&kvm->mmu_lock);
+
+	if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
+		/* No suitable pte found -> try to insert a mapping */
+		ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
+					writing, kvm_ro, &pte, &level);
+		if (ret == -EAGAIN)
+			return RESUME_GUEST;
+		else if (ret)
+			return ret;
+		shift = kvmppc_radix_level_to_shift(level);
+	}
+	/* Align gfn to the start of the page */
+	gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT;
+
+	/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
+
+	/* The permissions is the combination of the host and l1 guest ptes */
+	perm |= gpte.may_read ? 0UL : _PAGE_READ;
+	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
+	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+	/* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */
+	perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED;
+	perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY;
+	pte = __pte(pte_val(pte) & ~perm);
+
+	/* What size pte can we insert? */
+	if (shift > l1_shift) {
+		u64 mask;
+		unsigned int actual_shift = PAGE_SHIFT;
+		if (PMD_SHIFT < l1_shift)
+			actual_shift = PMD_SHIFT;
+		mask = (1UL << shift) - (1UL << actual_shift);
+		pte = __pte(pte_val(pte) | (gpa & mask));
+		shift = actual_shift;
+	}
+	level = kvmppc_radix_shift_to_level(shift);
+	n_gpa &= ~((1UL << shift) - 1);
+
+	/* 4. Insert the pte into our shadow_pgtable */
+
+	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+	if (!n_rmap)
+		return RESUME_GUEST; /* Let the guest try again */
+	n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
+		(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
+				mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
+	kfree(n_rmap);
+	if (ret == -EAGAIN)
+		ret = RESUME_GUEST;	/* Let the guest try again */
+
+	return ret;
+
+ inval:
+	kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
+	return RESUME_GUEST;
+}
+
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
+{
+	struct kvm_nested_guest *gp = vcpu->arch.nested;
+	long int ret;
+
+	mutex_lock(&gp->tlb_lock);
+	ret = __kvmhv_nested_page_fault(vcpu, gp);
+	mutex_unlock(&gp->tlb_lock);
+	return ret;
+}
+
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
+{
+	int ret = lpid + 1;
+
+	spin_lock(&kvm->mmu_lock);
+	if (!idr_get_next(&kvm->arch.kvm_nested_guest_idr, &ret))
+		ret = -1;
+	spin_unlock(&kvm->mmu_lock);
+
+	return ret;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
new file mode 100644
index 0000000000..34f1db2128
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <asm/asm-prototypes.h>
+#include <asm/dbell.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_hv.h"
+
+static void load_spr_state(struct kvm_vcpu *vcpu,
+				struct p9_host_os_sprs *host_os_sprs)
+{
+	/* TAR is very fast */
+	mtspr(SPRN_TAR, vcpu->arch.tar);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+	    current->thread.vrsave != vcpu->arch.vrsave)
+		mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+#endif
+
+	if (vcpu->arch.hfscr & HFSCR_EBB) {
+		if (current->thread.ebbhr != vcpu->arch.ebbhr)
+			mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+		if (current->thread.ebbrr != vcpu->arch.ebbrr)
+			mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+		if (current->thread.bescr != vcpu->arch.bescr)
+			mtspr(SPRN_BESCR, vcpu->arch.bescr);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
+			current->thread.tidr != vcpu->arch.tid)
+		mtspr(SPRN_TIDR, vcpu->arch.tid);
+	if (host_os_sprs->iamr != vcpu->arch.iamr)
+		mtspr(SPRN_IAMR, vcpu->arch.iamr);
+	if (host_os_sprs->amr != vcpu->arch.amr)
+		mtspr(SPRN_AMR, vcpu->arch.amr);
+	if (vcpu->arch.uamor != 0)
+		mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+	if (current->thread.fscr != vcpu->arch.fscr)
+		mtspr(SPRN_FSCR, vcpu->arch.fscr);
+	if (current->thread.dscr != vcpu->arch.dscr)
+		mtspr(SPRN_DSCR, vcpu->arch.dscr);
+	if (vcpu->arch.pspb != 0)
+		mtspr(SPRN_PSPB, vcpu->arch.pspb);
+
+	/*
+	 * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
+	 * clear (or hstate set appropriately to catch those registers
+	 * being clobbered if we take a MCE or SRESET), so those are done
+	 * later.
+	 */
+
+	if (!(vcpu->arch.ctrl & 1))
+		mtspr(SPRN_CTRLT, 0);
+}
+
+static void store_spr_state(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.tar = mfspr(SPRN_TAR);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
+#endif
+
+	if (vcpu->arch.hfscr & HFSCR_EBB) {
+		vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+		vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+		vcpu->arch.bescr = mfspr(SPRN_BESCR);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		vcpu->arch.tid = mfspr(SPRN_TIDR);
+	vcpu->arch.iamr = mfspr(SPRN_IAMR);
+	vcpu->arch.amr = mfspr(SPRN_AMR);
+	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+	vcpu->arch.fscr = mfspr(SPRN_FSCR);
+	vcpu->arch.dscr = mfspr(SPRN_DSCR);
+	vcpu->arch.pspb = mfspr(SPRN_PSPB);
+
+	vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
+}
+
+/* Returns true if current MSR and/or guest MSR may have changed */
+bool load_vcpu_state(struct kvm_vcpu *vcpu,
+		     struct p9_host_os_sprs *host_os_sprs)
+{
+	bool ret = false;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+		unsigned long guest_msr = vcpu->arch.shregs.msr;
+		if (MSR_TM_ACTIVE(guest_msr)) {
+			kvmppc_restore_tm_hv(vcpu, guest_msr, true);
+			ret = true;
+		} else if (vcpu->arch.hfscr & HFSCR_TM) {
+			mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+			mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
+			mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+		}
+	}
+#endif
+
+	load_spr_state(vcpu, host_os_sprs);
+
+	load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+	load_vr_state(&vcpu->arch.vr);
+#endif
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(load_vcpu_state);
+
+void store_vcpu_state(struct kvm_vcpu *vcpu)
+{
+	store_spr_state(vcpu);
+
+	store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+	store_vr_state(&vcpu->arch.vr);
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+		unsigned long guest_msr = vcpu->arch.shregs.msr;
+		if (MSR_TM_ACTIVE(guest_msr)) {
+			kvmppc_save_tm_hv(vcpu, guest_msr, true);
+		} else if (vcpu->arch.hfscr & HFSCR_TM) {
+			vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+			vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
+			vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
+
+			if (!vcpu->arch.nested) {
+				vcpu->arch.load_tm++; /* see load_ebb comment */
+				if (!vcpu->arch.load_tm)
+					vcpu->arch.hfscr &= ~HFSCR_TM;
+			}
+		}
+	}
+#endif
+}
+EXPORT_SYMBOL_GPL(store_vcpu_state);
+
+void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
+{
+	host_os_sprs->iamr = mfspr(SPRN_IAMR);
+	host_os_sprs->amr = mfspr(SPRN_AMR);
+}
+EXPORT_SYMBOL_GPL(save_p9_host_os_sprs);
+
+/* vcpu guest regs must already be saved */
+void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
+			     struct p9_host_os_sprs *host_os_sprs)
+{
+	/*
+	 * current->thread.xxx registers must all be restored to host
+	 * values before a potential context switch, otherwise the context
+	 * switch itself will overwrite current->thread.xxx with the values
+	 * from the guest SPRs.
+	 */
+
+	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
+
+	if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
+			current->thread.tidr != vcpu->arch.tid)
+		mtspr(SPRN_TIDR, current->thread.tidr);
+	if (host_os_sprs->iamr != vcpu->arch.iamr)
+		mtspr(SPRN_IAMR, host_os_sprs->iamr);
+	if (vcpu->arch.uamor != 0)
+		mtspr(SPRN_UAMOR, 0);
+	if (host_os_sprs->amr != vcpu->arch.amr)
+		mtspr(SPRN_AMR, host_os_sprs->amr);
+	if (current->thread.fscr != vcpu->arch.fscr)
+		mtspr(SPRN_FSCR, current->thread.fscr);
+	if (current->thread.dscr != vcpu->arch.dscr)
+		mtspr(SPRN_DSCR, current->thread.dscr);
+	if (vcpu->arch.pspb != 0)
+		mtspr(SPRN_PSPB, 0);
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	if (!(vcpu->arch.ctrl & 1))
+		mtspr(SPRN_CTRLT, 1);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+	    vcpu->arch.vrsave != current->thread.vrsave)
+		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+#endif
+	if (vcpu->arch.hfscr & HFSCR_EBB) {
+		if (vcpu->arch.bescr != current->thread.bescr)
+			mtspr(SPRN_BESCR, current->thread.bescr);
+		if (vcpu->arch.ebbhr != current->thread.ebbhr)
+			mtspr(SPRN_EBBHR, current->thread.ebbhr);
+		if (vcpu->arch.ebbrr != current->thread.ebbrr)
+			mtspr(SPRN_EBBRR, current->thread.ebbrr);
+
+		if (!vcpu->arch.nested) {
+			/*
+			 * This is like load_fp in context switching, turn off
+			 * the facility after it wraps the u8 to try avoiding
+			 * saving and restoring the registers each partition
+			 * switch.
+			 */
+			vcpu->arch.load_ebb++;
+			if (!vcpu->arch.load_ebb)
+				vcpu->arch.hfscr &= ~HFSCR_EBB;
+		}
+	}
+
+	if (vcpu->arch.tar != current->thread.tar)
+		mtspr(SPRN_TAR, current->thread.tar);
+}
+EXPORT_SYMBOL_GPL(restore_p9_host_os_sprs);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
+void accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	struct kvmhv_tb_accumulator *curr;
+	u64 tb = mftb() - vc->tb_offset_applied;
+	u64 prev_tb;
+	u64 delta;
+	u64 seq;
+
+	curr = vcpu->arch.cur_activity;
+	vcpu->arch.cur_activity = next;
+	prev_tb = vcpu->arch.cur_tb_start;
+	vcpu->arch.cur_tb_start = tb;
+
+	if (!curr)
+		return;
+
+	delta = tb - prev_tb;
+
+	seq = curr->seqcount;
+	curr->seqcount = seq + 1;
+	smp_wmb();
+	curr->tb_total += delta;
+	if (seq == 0 || delta < curr->tb_min)
+		curr->tb_min = delta;
+	if (delta > curr->tb_max)
+		curr->tb_max = delta;
+	smp_wmb();
+	curr->seqcount = seq + 2;
+}
+EXPORT_SYMBOL_GPL(accumulate_time);
+#endif
+
+static inline u64 mfslbv(unsigned int idx)
+{
+	u64 slbev;
+
+	asm volatile("slbmfev  %0,%1" : "=r" (slbev) : "r" (idx));
+
+	return slbev;
+}
+
+static inline u64 mfslbe(unsigned int idx)
+{
+	u64 slbee;
+
+	asm volatile("slbmfee  %0,%1" : "=r" (slbee) : "r" (idx));
+
+	return slbee;
+}
+
+static inline void mtslb(u64 slbee, u64 slbev)
+{
+	asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
+}
+
+static inline void clear_slb_entry(unsigned int idx)
+{
+	mtslb(idx, 0);
+}
+
+static inline void slb_clear_invalidate_partition(void)
+{
+	clear_slb_entry(0);
+	asm volatile(PPC_SLBIA(6));
+}
+
+/*
+ * Malicious or buggy radix guests may have inserted SLB entries
+ * (only 0..3 because radix always runs with UPRT=1), so these must
+ * be cleared here to avoid side-channels. slbmte is used rather
+ * than slbia, as it won't clear cached translations.
+ */
+static void radix_clear_slb(void)
+{
+	int i;
+
+	for (i = 0; i < 4; i++)
+		clear_slb_entry(i);
+}
+
+static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
+{
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	u32 lpid;
+	u32 pid;
+
+	lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
+	pid = vcpu->arch.pid;
+
+	/*
+	 * Prior memory accesses to host PID Q3 must be completed before we
+	 * start switching, and stores must be drained to avoid not-my-LPAR
+	 * logic (see switch_mmu_to_host).
+	 */
+	asm volatile("hwsync" ::: "memory");
+	isync();
+	mtspr(SPRN_LPID, lpid);
+	mtspr(SPRN_LPCR, lpcr);
+	mtspr(SPRN_PID, pid);
+	/*
+	 * isync not required here because we are HRFID'ing to guest before
+	 * any guest context access, which is context synchronising.
+	 */
+}
+
+static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
+{
+	u32 lpid;
+	u32 pid;
+	int i;
+
+	lpid = kvm->arch.lpid;
+	pid = vcpu->arch.pid;
+
+	/*
+	 * See switch_mmu_to_guest_radix. ptesync should not be required here
+	 * even if the host is in HPT mode because speculative accesses would
+	 * not cause RC updates (we are in real mode).
+	 */
+	asm volatile("hwsync" ::: "memory");
+	isync();
+	mtspr(SPRN_LPID, lpid);
+	mtspr(SPRN_LPCR, lpcr);
+	mtspr(SPRN_PID, pid);
+
+	for (i = 0; i < vcpu->arch.slb_max; i++)
+		mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
+	/*
+	 * isync not required here, see switch_mmu_to_guest_radix.
+	 */
+}
+
+static void switch_mmu_to_host(struct kvm *kvm, u32 pid)
+{
+	u32 lpid = kvm->arch.host_lpid;
+	u64 lpcr = kvm->arch.host_lpcr;
+
+	/*
+	 * The guest has exited, so guest MMU context is no longer being
+	 * non-speculatively accessed, but a hwsync is needed before the
+	 * mtLPIDR / mtPIDR switch, in order to ensure all stores are drained,
+	 * so the not-my-LPAR tlbie logic does not overlook them.
+	 */
+	asm volatile("hwsync" ::: "memory");
+	isync();
+	mtspr(SPRN_PID, pid);
+	mtspr(SPRN_LPID, lpid);
+	mtspr(SPRN_LPCR, lpcr);
+	/*
+	 * isync is not required after the switch, because mtmsrd with L=0
+	 * is performed after this switch, which is context synchronising.
+	 */
+
+	if (!radix_enabled())
+		slb_restore_bolted_realmode();
+}
+
+static void save_clear_host_mmu(struct kvm *kvm)
+{
+	if (!radix_enabled()) {
+		/*
+		 * Hash host could save and restore host SLB entries to
+		 * reduce SLB fault overheads of VM exits, but for now the
+		 * existing code clears all entries and restores just the
+		 * bolted ones when switching back to host.
+		 */
+		slb_clear_invalidate_partition();
+	}
+}
+
+static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+	if (kvm_is_radix(kvm)) {
+		radix_clear_slb();
+	} else {
+		int i;
+		int nr = 0;
+
+		/*
+		 * This must run before switching to host (radix host can't
+		 * access all SLBs).
+		 */
+		for (i = 0; i < vcpu->arch.slb_nr; i++) {
+			u64 slbee, slbev;
+
+			slbee = mfslbe(i);
+			if (slbee & SLB_ESID_V) {
+				slbev = mfslbv(i);
+				vcpu->arch.slb[nr].orige = slbee | i;
+				vcpu->arch.slb[nr].origv = slbev;
+				nr++;
+			}
+		}
+		vcpu->arch.slb_max = nr;
+		slb_clear_invalidate_partition();
+	}
+}
+
+static void flush_guest_tlb(struct kvm *kvm)
+{
+	unsigned long rb, set;
+
+	rb = PPC_BIT(52);	/* IS = 2 */
+	if (kvm_is_radix(kvm)) {
+		/* R=1 PRS=1 RIC=2 */
+		asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+			     : : "r" (rb), "i" (1), "i" (1), "i" (2),
+			       "r" (0) : "memory");
+		for (set = 1; set < kvm->arch.tlb_sets; ++set) {
+			rb += PPC_BIT(51);	/* increment set number */
+			/* R=1 PRS=1 RIC=0 */
+			asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+				     : : "r" (rb), "i" (1), "i" (1), "i" (0),
+				       "r" (0) : "memory");
+		}
+		asm volatile("ptesync": : :"memory");
+		// POWER9 congruence-class TLBIEL leaves ERAT. Flush it now.
+		asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory");
+	} else {
+		for (set = 0; set < kvm->arch.tlb_sets; ++set) {
+			/* R=0 PRS=0 RIC=0 */
+			asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+				     : : "r" (rb), "i" (0), "i" (0), "i" (0),
+				       "r" (0) : "memory");
+			rb += PPC_BIT(51);	/* increment set number */
+		}
+		asm volatile("ptesync": : :"memory");
+		// POWER9 congruence-class TLBIEL leaves ERAT. Flush it now.
+		asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory");
+	}
+}
+
+static void check_need_tlb_flush(struct kvm *kvm, int pcpu,
+				 struct kvm_nested_guest *nested)
+{
+	cpumask_t *need_tlb_flush;
+	bool all_set = true;
+	int i;
+
+	if (nested)
+		need_tlb_flush = &nested->need_tlb_flush;
+	else
+		need_tlb_flush = &kvm->arch.need_tlb_flush;
+
+	if (likely(!cpumask_test_cpu(pcpu, need_tlb_flush)))
+		return;
+
+	/*
+	 * Individual threads can come in here, but the TLB is shared between
+	 * the 4 threads in a core, hence invalidating on one thread
+	 * invalidates for all, so only invalidate the first time (if all bits
+	 * were set.  The others must still execute a ptesync.
+	 *
+	 * If a race occurs and two threads do the TLB flush, that is not a
+	 * problem, just sub-optimal.
+	 */
+	for (i = cpu_first_tlb_thread_sibling(pcpu);
+			i <= cpu_last_tlb_thread_sibling(pcpu);
+			i += cpu_tlb_thread_sibling_step()) {
+		if (!cpumask_test_cpu(i, need_tlb_flush)) {
+			all_set = false;
+			break;
+		}
+	}
+	if (all_set)
+		flush_guest_tlb(kvm);
+	else
+		asm volatile("ptesync" ::: "memory");
+
+	/* Clear the bit after the TLB flush */
+	cpumask_clear_cpu(pcpu, need_tlb_flush);
+}
+
+unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr)
+{
+	unsigned long msr_needed = 0;
+
+	msr &= ~MSR_EE;
+
+	/* MSR bits may have been cleared by context switch so must recheck */
+	if (IS_ENABLED(CONFIG_PPC_FPU))
+		msr_needed |= MSR_FP;
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		msr_needed |= MSR_VEC;
+	if (cpu_has_feature(CPU_FTR_VSX))
+		msr_needed |= MSR_VSX;
+	if ((cpu_has_feature(CPU_FTR_TM) ||
+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+			(vcpu->arch.hfscr & HFSCR_TM))
+		msr_needed |= MSR_TM;
+
+	/*
+	 * This could be combined with MSR[RI] clearing, but that expands
+	 * the unrecoverable window. It would be better to cover unrecoverable
+	 * with KVM bad interrupt handling rather than use MSR[RI] at all.
+	 *
+	 * Much more difficult and less worthwhile to combine with IR/DR
+	 * disable.
+	 */
+	if ((msr & msr_needed) != msr_needed) {
+		msr |= msr_needed;
+		__mtmsrd(msr, 0);
+	} else {
+		__hard_irq_disable();
+	}
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+	return msr;
+}
+EXPORT_SYMBOL_GPL(kvmppc_msr_hard_disable_set_facilities);
+
+int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
+{
+	struct p9_host_os_sprs host_os_sprs;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_nested_guest *nested = vcpu->arch.nested;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	s64 hdec, dec;
+	u64 purr, spurr;
+	u64 *exsave;
+	int trap;
+	unsigned long msr;
+	unsigned long host_hfscr;
+	unsigned long host_ciabr;
+	unsigned long host_dawr0;
+	unsigned long host_dawrx0;
+	unsigned long host_psscr;
+	unsigned long host_hpsscr;
+	unsigned long host_pidr;
+	unsigned long host_dawr1;
+	unsigned long host_dawrx1;
+	unsigned long dpdes;
+
+	hdec = time_limit - *tb;
+	if (hdec < 0)
+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
+
+	WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
+	WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
+
+	vcpu->arch.ceded = 0;
+
+	/* Save MSR for restore, with EE clear. */
+	msr = mfmsr() & ~MSR_EE;
+
+	host_hfscr = mfspr(SPRN_HFSCR);
+	host_ciabr = mfspr(SPRN_CIABR);
+	host_psscr = mfspr(SPRN_PSSCR_PR);
+	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+		host_hpsscr = mfspr(SPRN_PSSCR);
+	host_pidr = mfspr(SPRN_PID);
+
+	if (dawr_enabled()) {
+		host_dawr0 = mfspr(SPRN_DAWR0);
+		host_dawrx0 = mfspr(SPRN_DAWRX0);
+		if (cpu_has_feature(CPU_FTR_DAWR1)) {
+			host_dawr1 = mfspr(SPRN_DAWR1);
+			host_dawrx1 = mfspr(SPRN_DAWRX1);
+		}
+	}
+
+	local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+	local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+
+	save_p9_host_os_sprs(&host_os_sprs);
+
+	msr = kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
+	if (lazy_irq_pending()) {
+		trap = 0;
+		goto out;
+	}
+
+	if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
+		msr = mfmsr(); /* MSR may have been updated */
+
+	if (vc->tb_offset) {
+		u64 new_tb = *tb + vc->tb_offset;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		*tb = new_tb;
+		vc->tb_offset_applied = vc->tb_offset;
+	}
+
+	mtspr(SPRN_VTB, vc->vtb);
+	mtspr(SPRN_PURR, vcpu->arch.purr);
+	mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+	if (vc->pcr)
+		mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
+	if (vcpu->arch.doorbell_request) {
+		vcpu->arch.doorbell_request = 0;
+		mtspr(SPRN_DPDES, 1);
+	}
+
+	if (dawr_enabled()) {
+		if (vcpu->arch.dawr0 != host_dawr0)
+			mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+		if (vcpu->arch.dawrx0 != host_dawrx0)
+			mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
+		if (cpu_has_feature(CPU_FTR_DAWR1)) {
+			if (vcpu->arch.dawr1 != host_dawr1)
+				mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
+			if (vcpu->arch.dawrx1 != host_dawrx1)
+				mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
+		}
+	}
+	if (vcpu->arch.ciabr != host_ciabr)
+		mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+
+
+	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+		mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+		      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+	} else {
+		if (vcpu->arch.psscr != host_psscr)
+			mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+	}
+
+	mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+	mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
+	mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
+
+	/*
+	 * On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
+	 * Interrupt (HDSI) the HDSISR is not be updated at all.
+	 *
+	 * To work around this we put a canary value into the HDSISR before
+	 * returning to a guest and then check for this canary when we take a
+	 * HDSI. If we find the canary on a HDSI, we know the hardware didn't
+	 * update the HDSISR. In this case we return to the guest to retake the
+	 * HDSI which should correctly update the HDSISR the second time HDSI
+	 * entry.
+	 *
+	 * The "radix prefetch bug" test can be used to test for this bug, as
+	 * it also exists fo DD2.1 and below.
+	 */
+	if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+		mtspr(SPRN_HDSISR, HDSISR_CANARY);
+
+	mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+	mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+	mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+	mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+	/*
+	 * It might be preferable to load_vcpu_state here, in order to get the
+	 * GPR/FP register loads executing in parallel with the previous mtSPR
+	 * instructions, but for now that can't be done because the TM handling
+	 * in load_vcpu_state can change some SPRs and vcpu state (nip, msr).
+	 * But TM could be split out if this would be a significant benefit.
+	 */
+
+	/*
+	 * MSR[RI] does not need to be cleared (and is not, for radix guests
+	 * with no prefetch bug), because in_guest is set. If we take a SRESET
+	 * or MCE with in_guest set but still in HV mode, then
+	 * kvmppc_p9_bad_interrupt handles the interrupt, which effectively
+	 * clears MSR[RI] and doesn't return.
+	 */
+	WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_HV_P9);
+	barrier(); /* Open in_guest critical section */
+
+	/*
+	 * Hash host, hash guest, or radix guest with prefetch bug, all have
+	 * to disable the MMU before switching to guest MMU state.
+	 */
+	if (!radix_enabled() || !kvm_is_radix(kvm) ||
+			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+		__mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
+
+	save_clear_host_mmu(kvm);
+
+	if (kvm_is_radix(kvm))
+		switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
+	else
+		switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
+
+	/* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
+	check_need_tlb_flush(kvm, vc->pcpu, nested);
+
+	/*
+	 * P9 suppresses the HDEC exception when LPCR[HDICE] = 0,
+	 * so set guest LPCR (with HDICE) before writing HDEC.
+	 */
+	mtspr(SPRN_HDEC, hdec);
+
+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - *tb);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+tm_return_to_guest:
+#endif
+	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+	mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+	mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+	switch_pmu_to_guest(vcpu, &host_os_sprs);
+	accumulate_time(vcpu, &vcpu->arch.in_guest);
+
+	kvmppc_p9_enter_guest(vcpu);
+
+	accumulate_time(vcpu, &vcpu->arch.guest_exit);
+	switch_pmu_to_host(vcpu, &host_os_sprs);
+
+	/* XXX: Could get these from r11/12 and paca exsave instead */
+	vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
+	vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
+	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+
+	/* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */
+	trap = local_paca->kvm_hstate.scratch0 & ~0x2;
+
+	if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK))
+		exsave = local_paca->exgen;
+	else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET)
+		exsave = local_paca->exnmi;
+	else /* trap == 0x200 */
+		exsave = local_paca->exmc;
+
+	vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
+	vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
+
+	/*
+	 * After reading machine check regs (DAR, DSISR, SRR0/1) and hstate
+	 * scratch (which we need to move into exsave to make re-entrant vs
+	 * SRESET/MCE), register state is protected from reentrancy. However
+	 * timebase, MMU, among other state is still set to guest, so don't
+	 * enable MSR[RI] here. It gets enabled at the end, after in_guest
+	 * is cleared.
+	 *
+	 * It is possible an NMI could come in here, which is why it is
+	 * important to save the above state early so it can be debugged.
+	 */
+
+	vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
+	vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
+	vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
+	vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
+	vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
+	vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
+	vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
+	vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
+
+	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+
+	if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
+		vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
+		vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
+		kvmppc_realmode_machine_check(vcpu);
+
+	} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
+		kvmppc_p9_realmode_hmi_handler(vcpu);
+
+	} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
+		vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
+
+	} else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
+		vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
+		vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
+		vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
+
+	} else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+		vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
+
+	} else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
+		vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * Softpatch interrupt for transactional memory emulation cases
+	 * on POWER9 DD2.2.  This is early in the guest exit path - we
+	 * haven't saved registers or done a treclaim yet.
+	 */
+	} else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
+		vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
+
+		/*
+		 * The cases we want to handle here are those where the guest
+		 * is in real suspend mode and is trying to transition to
+		 * transactional mode.
+		 */
+		if (!local_paca->kvm_hstate.fake_suspend &&
+				(vcpu->arch.shregs.msr & MSR_TS_S)) {
+			if (kvmhv_p9_tm_emulation_early(vcpu)) {
+				/*
+				 * Go straight back into the guest with the
+				 * new NIP/MSR as set by TM emulation.
+				 */
+				mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
+				mtspr(SPRN_HSRR1, vcpu->arch.shregs.msr);
+				goto tm_return_to_guest;
+			}
+		}
+#endif
+	}
+
+	/* Advance host PURR/SPURR by the amount used by guest */
+	purr = mfspr(SPRN_PURR);
+	spurr = mfspr(SPRN_SPURR);
+	local_paca->kvm_hstate.host_purr += purr - vcpu->arch.purr;
+	local_paca->kvm_hstate.host_spurr += spurr - vcpu->arch.spurr;
+	vcpu->arch.purr = purr;
+	vcpu->arch.spurr = spurr;
+
+	vcpu->arch.ic = mfspr(SPRN_IC);
+	vcpu->arch.pid = mfspr(SPRN_PID);
+	vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
+
+	vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+	vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+	dpdes = mfspr(SPRN_DPDES);
+	if (dpdes)
+		vcpu->arch.doorbell_request = 1;
+
+	vc->vtb = mfspr(SPRN_VTB);
+
+	dec = mfspr(SPRN_DEC);
+	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+		dec = (s32) dec;
+	*tb = mftb();
+	vcpu->arch.dec_expires = dec + *tb;
+
+	if (vc->tb_offset_applied) {
+		u64 new_tb = *tb - vc->tb_offset_applied;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		*tb = new_tb;
+		vc->tb_offset_applied = 0;
+	}
+
+	save_clear_guest_mmu(kvm, vcpu);
+	switch_mmu_to_host(kvm, host_pidr);
+
+	/*
+	 * Enable MSR here in order to have facilities enabled to save
+	 * guest registers. This enables MMU (if we were in realmode), so
+	 * only switch MMU on after the MMU is switched to host, to avoid
+	 * the P9_RADIX_PREFETCH_BUG or hash guest context.
+	 */
+	if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
+			vcpu->arch.shregs.msr & MSR_TS_MASK)
+		msr |= MSR_TS_S;
+	__mtmsrd(msr, 0);
+
+	store_vcpu_state(vcpu);
+
+	mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
+	mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);
+
+	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+		/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+		mtspr(SPRN_PSSCR, host_hpsscr |
+		      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+	}
+
+	mtspr(SPRN_HFSCR, host_hfscr);
+	if (vcpu->arch.ciabr != host_ciabr)
+		mtspr(SPRN_CIABR, host_ciabr);
+
+	if (dawr_enabled()) {
+		if (vcpu->arch.dawr0 != host_dawr0)
+			mtspr(SPRN_DAWR0, host_dawr0);
+		if (vcpu->arch.dawrx0 != host_dawrx0)
+			mtspr(SPRN_DAWRX0, host_dawrx0);
+		if (cpu_has_feature(CPU_FTR_DAWR1)) {
+			if (vcpu->arch.dawr1 != host_dawr1)
+				mtspr(SPRN_DAWR1, host_dawr1);
+			if (vcpu->arch.dawrx1 != host_dawrx1)
+				mtspr(SPRN_DAWRX1, host_dawrx1);
+		}
+	}
+
+	if (dpdes)
+		mtspr(SPRN_DPDES, 0);
+	if (vc->pcr)
+		mtspr(SPRN_PCR, PCR_MASK);
+
+	/* HDEC must be at least as large as DEC, so decrementer_max fits */
+	mtspr(SPRN_HDEC, decrementer_max);
+
+	timer_rearm_host_dec(*tb);
+
+	restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+
+	barrier(); /* Close in_guest critical section */
+	WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_NONE);
+	/* Interrupts are recoverable at this point */
+
+	/*
+	 * cp_abort is required if the processor supports local copy-paste
+	 * to clear the copy buffer that was under control of the guest.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		asm volatile(PPC_CP_ABORT);
+
+out:
+	return trap;
+}
+EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_perf.c b/arch/powerpc/kvm/book3s_hv_p9_perf.c
new file mode 100644
index 0000000000..44d24cca3d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_p9_perf.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/kvm_ppc.h>
+#include <asm/pmc.h>
+
+#include "book3s_hv.h"
+
+static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
+{
+	if (!(mmcr0 & MMCR0_FC))
+		goto do_freeze;
+	if (mmcra & MMCRA_SAMPLE_ENABLE)
+		goto do_freeze;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		if (!(mmcr0 & MMCR0_PMCCEXT))
+			goto do_freeze;
+		if (!(mmcra & MMCRA_BHRB_DISABLE))
+			goto do_freeze;
+	}
+	return;
+
+do_freeze:
+	mmcr0 = MMCR0_FC;
+	mmcra = 0;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		mmcr0 |= MMCR0_PMCCEXT;
+		mmcra = MMCRA_BHRB_DISABLE;
+	}
+
+	mtspr(SPRN_MMCR0, mmcr0);
+	mtspr(SPRN_MMCRA, mmcra);
+	isync();
+}
+
+void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
+			 struct p9_host_os_sprs *host_os_sprs)
+{
+	struct lppaca *lp;
+	int load_pmu = 1;
+
+	lp = vcpu->arch.vpa.pinned_addr;
+	if (lp)
+		load_pmu = lp->pmcregs_in_use;
+
+	/* Save host */
+	if (ppc_get_pmu_inuse()) {
+		/* POWER9, POWER10 do not implement HPMC or SPMC */
+
+		host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
+		host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
+
+		freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra);
+
+		host_os_sprs->pmc1 = mfspr(SPRN_PMC1);
+		host_os_sprs->pmc2 = mfspr(SPRN_PMC2);
+		host_os_sprs->pmc3 = mfspr(SPRN_PMC3);
+		host_os_sprs->pmc4 = mfspr(SPRN_PMC4);
+		host_os_sprs->pmc5 = mfspr(SPRN_PMC5);
+		host_os_sprs->pmc6 = mfspr(SPRN_PMC6);
+		host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1);
+		host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2);
+		host_os_sprs->sdar = mfspr(SPRN_SDAR);
+		host_os_sprs->siar = mfspr(SPRN_SIAR);
+		host_os_sprs->sier1 = mfspr(SPRN_SIER);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3);
+			host_os_sprs->sier2 = mfspr(SPRN_SIER2);
+			host_os_sprs->sier3 = mfspr(SPRN_SIER3);
+		}
+	}
+
+#ifdef CONFIG_PPC_PSERIES
+	/* After saving PMU, before loading guest PMU, flip pmcregs_in_use */
+	if (kvmhv_on_pseries()) {
+		barrier();
+		get_lppaca()->pmcregs_in_use = load_pmu;
+		barrier();
+	}
+#endif
+
+	/*
+	 * Load guest. If the VPA said the PMCs are not in use but the guest
+	 * tried to access them anyway, HFSCR[PM] will be set by the HFAC
+	 * fault so we can make forward progress.
+	 */
+	if (load_pmu || (vcpu->arch.hfscr & HFSCR_PM)) {
+		mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
+		mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
+		mtspr(SPRN_PMC3, vcpu->arch.pmc[2]);
+		mtspr(SPRN_PMC4, vcpu->arch.pmc[3]);
+		mtspr(SPRN_PMC5, vcpu->arch.pmc[4]);
+		mtspr(SPRN_PMC6, vcpu->arch.pmc[5]);
+		mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]);
+		mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]);
+		mtspr(SPRN_SDAR, vcpu->arch.sdar);
+		mtspr(SPRN_SIAR, vcpu->arch.siar);
+		mtspr(SPRN_SIER, vcpu->arch.sier[0]);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]);
+			mtspr(SPRN_SIER2, vcpu->arch.sier[1]);
+			mtspr(SPRN_SIER3, vcpu->arch.sier[2]);
+		}
+
+		/* Set MMCRA then MMCR0 last */
+		mtspr(SPRN_MMCRA, vcpu->arch.mmcra);
+		mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]);
+		/* No isync necessary because we're starting counters */
+
+		if (!vcpu->arch.nested &&
+		    (vcpu->arch.hfscr_permitted & HFSCR_PM))
+			vcpu->arch.hfscr |= HFSCR_PM;
+	}
+}
+EXPORT_SYMBOL_GPL(switch_pmu_to_guest);
+
+void switch_pmu_to_host(struct kvm_vcpu *vcpu,
+			struct p9_host_os_sprs *host_os_sprs)
+{
+	struct lppaca *lp;
+	int save_pmu = 1;
+
+	lp = vcpu->arch.vpa.pinned_addr;
+	if (lp)
+		save_pmu = lp->pmcregs_in_use;
+	if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) {
+		/*
+		 * Save pmu if this guest is capable of running nested guests.
+		 * This is option is for old L1s that do not set their
+		 * lppaca->pmcregs_in_use properly when entering their L2.
+		 */
+		save_pmu |= nesting_enabled(vcpu->kvm);
+	}
+
+	if (save_pmu) {
+		vcpu->arch.mmcr[0] = mfspr(SPRN_MMCR0);
+		vcpu->arch.mmcra = mfspr(SPRN_MMCRA);
+
+		freeze_pmu(vcpu->arch.mmcr[0], vcpu->arch.mmcra);
+
+		vcpu->arch.pmc[0] = mfspr(SPRN_PMC1);
+		vcpu->arch.pmc[1] = mfspr(SPRN_PMC2);
+		vcpu->arch.pmc[2] = mfspr(SPRN_PMC3);
+		vcpu->arch.pmc[3] = mfspr(SPRN_PMC4);
+		vcpu->arch.pmc[4] = mfspr(SPRN_PMC5);
+		vcpu->arch.pmc[5] = mfspr(SPRN_PMC6);
+		vcpu->arch.mmcr[1] = mfspr(SPRN_MMCR1);
+		vcpu->arch.mmcr[2] = mfspr(SPRN_MMCR2);
+		vcpu->arch.sdar = mfspr(SPRN_SDAR);
+		vcpu->arch.siar = mfspr(SPRN_SIAR);
+		vcpu->arch.sier[0] = mfspr(SPRN_SIER);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			vcpu->arch.mmcr[3] = mfspr(SPRN_MMCR3);
+			vcpu->arch.sier[1] = mfspr(SPRN_SIER2);
+			vcpu->arch.sier[2] = mfspr(SPRN_SIER3);
+		}
+
+	} else if (vcpu->arch.hfscr & HFSCR_PM) {
+		/*
+		 * The guest accessed PMC SPRs without specifying they should
+		 * be preserved, or it cleared pmcregs_in_use after the last
+		 * access. Just ensure they are frozen.
+		 */
+		freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA));
+
+		/*
+		 * Demand-fault PMU register access in the guest.
+		 *
+		 * This is used to grab the guest's VPA pmcregs_in_use value
+		 * and reflect it into the host's VPA in the case of a nested
+		 * hypervisor.
+		 *
+		 * It also avoids having to zero-out SPRs after each guest
+		 * exit to avoid side-channels when.
+		 *
+		 * This is cleared here when we exit the guest, so later HFSCR
+		 * interrupt handling can add it back to run the guest with
+		 * PM enabled next time.
+		 */
+		if (!vcpu->arch.nested)
+			vcpu->arch.hfscr &= ~HFSCR_PM;
+	} /* otherwise the PMU should still be frozen */
+
+#ifdef CONFIG_PPC_PSERIES
+	if (kvmhv_on_pseries()) {
+		barrier();
+		get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
+		barrier();
+	}
+#endif
+
+	if (ppc_get_pmu_inuse()) {
+		mtspr(SPRN_PMC1, host_os_sprs->pmc1);
+		mtspr(SPRN_PMC2, host_os_sprs->pmc2);
+		mtspr(SPRN_PMC3, host_os_sprs->pmc3);
+		mtspr(SPRN_PMC4, host_os_sprs->pmc4);
+		mtspr(SPRN_PMC5, host_os_sprs->pmc5);
+		mtspr(SPRN_PMC6, host_os_sprs->pmc6);
+		mtspr(SPRN_MMCR1, host_os_sprs->mmcr1);
+		mtspr(SPRN_MMCR2, host_os_sprs->mmcr2);
+		mtspr(SPRN_SDAR, host_os_sprs->sdar);
+		mtspr(SPRN_SIAR, host_os_sprs->siar);
+		mtspr(SPRN_SIER, host_os_sprs->sier1);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			mtspr(SPRN_MMCR3, host_os_sprs->mmcr3);
+			mtspr(SPRN_SIER2, host_os_sprs->sier2);
+			mtspr(SPRN_SIER3, host_os_sprs->sier3);
+		}
+
+		/* Set MMCRA then MMCR0 last */
+		mtspr(SPRN_MMCRA, host_os_sprs->mmcra);
+		mtspr(SPRN_MMCR0, host_os_sprs->mmcr0);
+		isync();
+	}
+}
+EXPORT_SYMBOL_GPL(switch_pmu_to_host);
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 0000000000..82be6d8751
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <asm/lppaca.h>
+#include <asm/opal.h>
+#include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
+#include <asm/kvm_ppc.h>
+
+/* SRR1 bits for machine check on POWER7 */
+#define SRR1_MC_LDSTERR		(1ul << (63-42))
+#define SRR1_MC_IFETCH_SH	(63-45)
+#define SRR1_MC_IFETCH_MASK	0x7
+#define SRR1_MC_IFETCH_SLBPAR		2	/* SLB parity error */
+#define SRR1_MC_IFETCH_SLBMULTI		3	/* SLB multi-hit */
+#define SRR1_MC_IFETCH_SLBPARMULTI	4	/* SLB parity + multi-hit */
+#define SRR1_MC_IFETCH_TLBMULTI		5	/* I-TLB multi-hit */
+
+/* DSISR bits for machine check on POWER7 */
+#define DSISR_MC_DERAT_MULTI	0x800		/* D-ERAT multi-hit */
+#define DSISR_MC_TLB_MULTI	0x400		/* D-TLB multi-hit */
+#define DSISR_MC_SLB_PARITY	0x100		/* SLB parity error */
+#define DSISR_MC_SLB_MULTI	0x080		/* SLB multi-hit */
+#define DSISR_MC_SLB_PARMULTI	0x040		/* SLB parity + multi-hit */
+
+/* POWER7 SLB flush and reload */
+static void reload_slb(struct kvm_vcpu *vcpu)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* First clear out SLB */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+	/* Do they have an SLB shadow buffer registered? */
+	slb = vcpu->arch.slb_shadow.pinned_addr;
+	if (!slb)
+		return;
+
+	/* Sanity check */
+	n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+	if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
+		return;
+
+	/* Load up the SLB from that */
+	for (i = 0; i < n; ++i) {
+		unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+		unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+		rb = (rb & ~0xFFFul) | i;	/* insert entry number */
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+/*
+ * On POWER7, see if we can handle a machine check that occurred inside
+ * the guest in real mode, without switching to the host partition.
+ */
+static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+{
+	unsigned long srr1 = vcpu->arch.shregs.msr;
+	long handled = 1;
+
+	if (srr1 & SRR1_MC_LDSTERR) {
+		/* error on load/store */
+		unsigned long dsisr = vcpu->arch.shregs.dsisr;
+
+		if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+			     DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
+			/* flush and reload SLB; flushes D-ERAT too */
+			reload_slb(vcpu);
+			dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+				   DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
+		}
+		if (dsisr & DSISR_MC_TLB_MULTI) {
+			tlbiel_all_lpid(vcpu->kvm->arch.radix);
+			dsisr &= ~DSISR_MC_TLB_MULTI;
+		}
+		/* Any other errors we don't understand? */
+		if (dsisr & 0xffffffffUL)
+			handled = 0;
+	}
+
+	switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
+	case 0:
+		break;
+	case SRR1_MC_IFETCH_SLBPAR:
+	case SRR1_MC_IFETCH_SLBMULTI:
+	case SRR1_MC_IFETCH_SLBPARMULTI:
+		reload_slb(vcpu);
+		break;
+	case SRR1_MC_IFETCH_TLBMULTI:
+		tlbiel_all_lpid(vcpu->kvm->arch.radix);
+		break;
+	default:
+		handled = 0;
+	}
+
+	return handled;
+}
+
+void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+{
+	struct machine_check_event mce_evt;
+	long handled;
+
+	if (vcpu->kvm->arch.fwnmi_enabled) {
+		/* FWNMI guests handle their own recovery */
+		handled = 0;
+	} else {
+		handled = kvmppc_realmode_mc_power7(vcpu);
+	}
+
+	/*
+	 * Now get the event and stash it in the vcpu struct so it can
+	 * be handled by the primary thread in virtual mode.  We can't
+	 * call machine_check_queue_event() here if we are running on
+	 * an offline secondary thread.
+	 */
+	if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
+		if (handled && mce_evt.version == MCE_V1)
+			mce_evt.disposition = MCE_DISPOSITION_RECOVERED;
+	} else {
+		memset(&mce_evt, 0, sizeof(mce_evt));
+	}
+
+	vcpu->arch.mce_evt = mce_evt;
+}
+
+
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	long ret = 0;
+
+	/*
+	 * Unapply and clear the offset first. That way, if the TB was not
+	 * resynced then it will remain in host-offset, and if it was resynced
+	 * then it is brought into host-offset. Then the tb offset is
+	 * re-applied before continuing with the KVM exit.
+	 *
+	 * This way, we don't need to actually know whether not OPAL resynced
+	 * the timebase or do any of the complicated dance that the P7/8
+	 * path requires.
+	 */
+	if (vc->tb_offset_applied) {
+		u64 new_tb = mftb() - vc->tb_offset_applied;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		vc->tb_offset_applied = 0;
+	}
+
+	local_paca->hmi_irqs++;
+
+	if (hmi_handle_debugtrig(NULL) >= 0) {
+		ret = 1;
+		goto out;
+	}
+
+	if (ppc_md.hmi_exception_early)
+		ppc_md.hmi_exception_early(NULL);
+
+out:
+	if (vc->tb_offset) {
+		u64 new_tb = mftb() + vc->tb_offset;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		vc->tb_offset_applied = vc->tb_offset;
+	}
+
+	return ret;
+}
+
+/*
+ * The following subcore HMI handling is all only for pre-POWER9 CPUs.
+ */
+
+/* Check if dynamic split is in force and return subcore size accordingly. */
+static inline int kvmppc_cur_subcore_size(void)
+{
+	if (local_paca->kvm_hstate.kvm_split_mode)
+		return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
+
+	return threads_per_subcore;
+}
+
+void kvmppc_subcore_enter_guest(void)
+{
+	int thread_id, subcore_id;
+
+	thread_id = cpu_thread_in_core(local_paca->paca_index);
+	subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+	local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
+
+void kvmppc_subcore_exit_guest(void)
+{
+	int thread_id, subcore_id;
+
+	thread_id = cpu_thread_in_core(local_paca->paca_index);
+	subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+	local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
+
+static bool kvmppc_tb_resync_required(void)
+{
+	if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		return false;
+
+	return true;
+}
+
+static void kvmppc_tb_resync_done(void)
+{
+	clear_bit(CORE_TB_RESYNC_REQ_BIT,
+			&local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by primary thread during
+ * guest exit path.
+ *
+ * There are multiple reasons why HMI could occur, one of them is
+ * Timebase (TB) error. If this HMI is due to TB error, then TB would
+ * have been in stopped state. The opal hmi handler Will fix it and
+ * restore the TB value with host timebase value. For HMI caused due
+ * to non-TB errors, opal hmi handler will not touch/restore TB register
+ * and hence there won't be any change in TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of TB register whether it holds guest or host timebase
+ * value. Hence the idea is to resync the TB on every HMI, so that we
+ * know about the exact state of the TB value. Resync TB call will
+ * restore TB to host timebase.
+ *
+ * Things to consider:
+ * - On TB error, HMI interrupt is reported on all the threads of the core
+ *   that has encountered TB error irrespective of split-core mode.
+ * - The very first thread on the core that get chance to fix TB error
+ *   would rsync the TB with local chipTOD value.
+ * - The resync TB is a core level action i.e. it will sync all the TBs
+ *   in that core independent of split-core mode. This means if we trigger
+ *   TB sync from a thread from one subcore, it would affect TB values of
+ *   sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making opal hmi handler.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only primary threads from each subcore is responsible
+ * to set/unset its designated array element while entering/exiting the
+ * guset.
+ *
+ * After invoking opal hmi handler call, one of the thread (of entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - Primary thread from each subcore tries to set resync required bit[63]
+ *   of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ *   responsibility of TB resync. (Let us call it as thread leader)
+ * - All other threads which are in host will call
+ *   wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ *   paca->sibling_subcore_state to get cleared.
+ * - All the primary thread will clear its subcore status from subcore
+ *   state in_guest[] array respectively.
+ * - Once all primary threads clear in_guest[0-3], all of them will invoke
+ *   opal hmi handler.
+ * - Now all threads will wait for TB resync to complete by invoking
+ *   wait_for_tb_resync() except the thread leader.
+ * - Thread leader will do a TB resync by invoking opal_resync_timebase()
+ *   call and the it will clear the resync required bit.
+ * - All other threads will now come out of resync wait loop and proceed
+ *   with individual execution.
+ * - On return of this function, primary thread will signal all
+ *   secondary threads to proceed.
+ * - All secondary threads will eventually call opal hmi handler on
+ *   their exit path.
+ *
+ * Returns 1 if the timebase offset should be applied, 0 if not.
+ */
+
+long kvmppc_realmode_hmi_handler(void)
+{
+	bool resync_req;
+
+	local_paca->hmi_irqs++;
+
+	if (hmi_handle_debugtrig(NULL) >= 0)
+		return 1;
+
+	/*
+	 * By now primary thread has already completed guest->host
+	 * partition switch but haven't signaled secondaries yet.
+	 * All the secondary threads on this subcore is waiting
+	 * for primary thread to signal them to go ahead.
+	 *
+	 * For threads from subcore which isn't in guest, they all will
+	 * wait until all other subcores on this core exit the guest.
+	 *
+	 * Now set the resync required bit. If you are the first to
+	 * set this bit then kvmppc_tb_resync_required() function will
+	 * return true. For rest all other subcores
+	 * kvmppc_tb_resync_required() will return false.
+	 *
+	 * If resync_req == true, then this thread is responsible to
+	 * initiate TB resync after hmi handler has completed.
+	 * All other threads on this core will wait until this thread
+	 * clears the resync required bit flag.
+	 */
+	resync_req = kvmppc_tb_resync_required();
+
+	/* Reset the subcore status to indicate it has exited guest */
+	kvmppc_subcore_exit_guest();
+
+	/*
+	 * Wait for other subcores on this core to exit the guest.
+	 * All the primary threads and threads from subcore that are
+	 * not in guest will wait here until all subcores are out
+	 * of guest context.
+	 */
+	wait_for_subcore_guest_exit();
+
+	/*
+	 * At this point we are sure that primary threads from each
+	 * subcore on this core have completed guest->host partition
+	 * switch. Now it is safe to call HMI handler.
+	 */
+	if (ppc_md.hmi_exception_early)
+		ppc_md.hmi_exception_early(NULL);
+
+	/*
+	 * Check if this thread is responsible to resync TB.
+	 * All other threads will wait until this thread completes the
+	 * TB resync.
+	 */
+	if (resync_req) {
+		opal_resync_timebase();
+		/* Reset TB resync req bit */
+		kvmppc_tb_resync_done();
+	} else {
+		wait_for_tb_resync();
+	}
+
+	/*
+	 * Reset tb_offset_applied so the guest exit code won't try
+	 * to subtract the previous timebase offset from the timebase.
+	 */
+	if (local_paca->kvm_hstate.kvm_vcore)
+		local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
+
+	return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 0000000000..9182324dbe
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,1300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
+#include <linux/log2.h>
+#include <linux/sizes.h>
+
+#include <asm/trace.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/pte-walk.h>
+
+/* Translate address of a vmalloc'd thing to a linear map address */
+static void *real_vmalloc_addr(void *addr)
+{
+	return __va(ppc_find_vmap_phys((unsigned long)addr));
+}
+
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm)
+{
+	int global;
+	int cpu;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * as indicated by local_paca->kvm_hstate.kvm_vcpu being set,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu)
+		global = 0;
+	else
+		global = 1;
+
+	/* LPID has been switched to host if in virt mode so can't do local */
+	if (!global && (mfmsr() & (MSR_IR|MSR_DR)))
+		global = 1;
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
+		cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
+/*
+ * Add this HPTE into the chain for the real page.
+ * Must be called with the chain locked; it unlocks the chain.
+ */
+void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			     unsigned long *rmap, long pte_index, int realmode)
+{
+	struct revmap_entry *head, *tail;
+	unsigned long i;
+
+	if (*rmap & KVMPPC_RMAP_PRESENT) {
+		i = *rmap & KVMPPC_RMAP_INDEX;
+		head = &kvm->arch.hpt.rev[i];
+		if (realmode)
+			head = real_vmalloc_addr(head);
+		tail = &kvm->arch.hpt.rev[head->back];
+		if (realmode)
+			tail = real_vmalloc_addr(tail);
+		rev->forw = i;
+		rev->back = head->back;
+		tail->forw = pte_index;
+		head->back = pte_index;
+	} else {
+		rev->forw = rev->back = pte_index;
+		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+			pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT;
+	}
+	unlock_rmap(rmap);
+}
+EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
+
+/* Update the dirty bitmap of a memslot */
+void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot,
+			     unsigned long gfn, unsigned long psize)
+{
+	unsigned long npages;
+
+	if (!psize || !memslot->dirty_bitmap)
+		return;
+	npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
+	gfn -= memslot->base_gfn;
+	set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
+
+static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
+				unsigned long hpte_v, unsigned long hpte_gr)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long psize;
+
+	psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
+	gfn = hpte_rpn(hpte_gr, psize);
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslot && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, psize);
+}
+
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+				      unsigned long hpte_gr,
+				      struct kvm_memory_slot **memslotp,
+				      unsigned long *gfnp)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	unsigned long gfn;
+
+	gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslotp)
+		*memslotp = memslot;
+	if (gfnp)
+		*gfnp = gfn;
+	if (!memslot)
+		return NULL;
+
+	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+	return rmap;
+}
+
+/* Remove this HPTE from the chain for a real page */
+static void remove_revmap_chain(struct kvm *kvm, long pte_index,
+				struct revmap_entry *rev,
+				unsigned long hpte_v, unsigned long hpte_r)
+{
+	struct revmap_entry *next, *prev;
+	unsigned long ptel, head;
+	unsigned long *rmap;
+	unsigned long rcbits;
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+
+	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
+	ptel = rev->guest_rpte |= rcbits;
+	rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
+	if (!rmap)
+		return;
+	lock_rmap(rmap);
+
+	head = *rmap & KVMPPC_RMAP_INDEX;
+	next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]);
+	prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]);
+	next->back = rev->back;
+	prev->forw = rev->forw;
+	if (head == pte_index) {
+		head = rev->forw;
+		if (head == pte_index)
+			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		else
+			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
+	}
+	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+	if (rcbits & HPTE_R_C)
+		kvmppc_update_dirty_map(memslot, gfn,
+					kvmppc_actual_pgsz(hpte_v, hpte_r));
+	unlock_rmap(rmap);
+}
+
+long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+		       long pte_index, unsigned long pteh, unsigned long ptel,
+		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
+{
+	unsigned long i, pa, gpa, gfn, psize;
+	unsigned long slot_fn, hva;
+	__be64 *hpte;
+	struct revmap_entry *rev;
+	unsigned long g_ptel;
+	struct kvm_memory_slot *memslot;
+	unsigned hpage_shift;
+	bool is_ci;
+	unsigned long *rmap;
+	pte_t *ptep;
+	unsigned int writing;
+	unsigned long mmu_seq;
+	unsigned long rcbits;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	/*
+	 * The HPTE gets used by compute_tlbie_rb() to set TLBIE bits, so
+	 * these functions should work together -- must ensure a guest can not
+	 * cause problems with the TLBIE that KVM executes.
+	 */
+	if ((pteh >> HPTE_V_SSIZE_SHIFT) & 0x2) {
+		/* B=0b1x is a reserved value, disallow it. */
+		return H_PARAMETER;
+	}
+	psize = kvmppc_actual_pgsz(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
+	writing = hpte_is_writable(ptel);
+	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+	ptel &= ~HPTE_GR_RESERVED;
+	g_ptel = ptel;
+
+	/* used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	/* Find the memslot (if any) for this address */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	pa = 0;
+	is_ci = false;
+	rmap = NULL;
+	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
+		/* Emulated MMIO - mark this with key=31 */
+		pteh |= HPTE_V_ABSENT;
+		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		goto do_insert;
+	}
+
+	/* Check if the requested page fits entirely in the memslot. */
+	if (!slot_is_aligned(memslot, psize))
+		return H_PARAMETER;
+	slot_fn = gfn - memslot->base_gfn;
+	rmap = &memslot->arch.rmap[slot_fn];
+
+	/* Translate to host virtual address */
+	hva = __gfn_to_hva_memslot(memslot, gfn);
+
+	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &hpage_shift);
+	if (ptep) {
+		pte_t pte;
+		unsigned int host_pte_size;
+
+		if (hpage_shift)
+			host_pte_size = 1ul << hpage_shift;
+		else
+			host_pte_size = PAGE_SIZE;
+		/*
+		 * We should always find the guest page size
+		 * to <= host page size, if host is using hugepage
+		 */
+		if (host_pte_size < psize) {
+			arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
+			return H_PARAMETER;
+		}
+		pte = kvmppc_read_update_linux_pte(ptep, writing);
+		if (pte_present(pte) && !pte_protnone(pte)) {
+			if (writing && !pte_write(pte))
+				/* make the actual HPTE be read-only */
+				ptel = hpte_make_readonly(ptel);
+			is_ci = pte_ci(pte);
+			pa = pte_pfn(pte) << PAGE_SHIFT;
+			pa |= hva & (host_pte_size - 1);
+			pa |= gpa & ~PAGE_MASK;
+		}
+	}
+	arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
+
+	ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
+	ptel |= pa;
+
+	if (pa)
+		pteh |= HPTE_V_VALID;
+	else {
+		pteh |= HPTE_V_ABSENT;
+		ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	}
+
+	/*If we had host pte mapping then  Check WIMG */
+	if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
+		if (is_ci)
+			return H_PARAMETER;
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
+		ptel |= HPTE_R_M;
+	}
+
+	/* Find and lock the HPTEG slot to use */
+ do_insert:
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+		for (i = 0; i < 8; ++i) {
+			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+					  HPTE_V_ABSENT))
+				break;
+			hpte += 2;
+		}
+		if (i == 8) {
+			/*
+			 * Since try_lock_hpte doesn't retry (not even stdcx.
+			 * failures), it could be that there is a free slot
+			 * but we transiently failed to lock it.  Try again,
+			 * actually locking each slot and checking it.
+			 */
+			hpte -= 16;
+			for (i = 0; i < 8; ++i) {
+				u64 pte;
+				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+					cpu_relax();
+				pte = be64_to_cpu(hpte[0]);
+				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
+					break;
+				__unlock_hpte(hpte, pte);
+				hpte += 2;
+			}
+			if (i == 8)
+				return H_PTEG_FULL;
+		}
+		pte_index += i;
+	} else {
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+				   HPTE_V_ABSENT)) {
+			/* Lock the slot and check again */
+			u64 pte;
+
+			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+				cpu_relax();
+			pte = be64_to_cpu(hpte[0]);
+			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+				__unlock_hpte(hpte, pte);
+				return H_PTEG_FULL;
+			}
+		}
+	}
+
+	/* Save away the guest's idea of the second HPTE dword */
+	rev = &kvm->arch.hpt.rev[pte_index];
+	if (realmode)
+		rev = real_vmalloc_addr(rev);
+	if (rev) {
+		rev->guest_rpte = g_ptel;
+		note_hpte_modification(kvm, rev);
+	}
+
+	/* Link HPTE into reverse-map chain */
+	if (pteh & HPTE_V_VALID) {
+		if (realmode)
+			rmap = real_vmalloc_addr(rmap);
+		lock_rmap(rmap);
+		/* Check for pending invalidations under the rmap chain lock */
+		if (mmu_invalidate_retry(kvm, mmu_seq)) {
+			/* inval in progress, write a non-present HPTE */
+			pteh |= HPTE_V_ABSENT;
+			pteh &= ~HPTE_V_VALID;
+			ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+			unlock_rmap(rmap);
+		} else {
+			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
+						realmode);
+			/* Only set R/C in real HPTE if already set in *rmap */
+			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+		}
+	}
+
+	/* Convert to new format on P9 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		ptel = hpte_old_to_new_r(pteh, ptel);
+		pteh = hpte_old_to_new_v(pteh);
+	}
+	hpte[1] = cpu_to_be64(ptel);
+
+	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
+	eieio();
+	__unlock_hpte(hpte, pteh);
+	asm volatile("ptesync" : : : "memory");
+
+	*pte_idx_ret = pte_index;
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
+				 vcpu->arch.pgdir, true,
+				 &vcpu->arch.regs.gpr[4]);
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_enter);
+
+#ifdef __BIG_ENDIAN__
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+#else
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
+#endif
+
+static inline int is_mmio_hpte(unsigned long v, unsigned long r)
+{
+	return ((v & HPTE_V_ABSENT) &&
+		(r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+		(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
+}
+
+static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid)
+{
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		/* Radix flush for a hash guest */
+
+		unsigned long rb,rs,prs,r,ric;
+
+		rb = PPC_BIT(52); /* IS = 2 */
+		rs = 0;  /* lpid = 0 */
+		prs = 0; /* partition scoped */
+		r = 1;   /* radix format */
+		ric = 0; /* RIC_FLSUH_TLB */
+
+		/*
+		 * Need the extra ptesync to make sure we don't
+		 * re-order the tlbie
+		 */
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+			     : : "r"(rb), "i"(r), "i"(prs),
+			       "i"(ric), "r"(rs) : "memory");
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+			     "r" (rb_value), "r" (lpid));
+	}
+}
+
+static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
+		      long npages, int global, bool need_sync)
+{
+	long i;
+
+	/*
+	 * We use the POWER9 5-operand versions of tlbie and tlbiel here.
+	 * Since we are using RIC=0 PRS=0 R=0, and P7/P8 tlbiel ignores
+	 * the RS field, this is backwards-compatible with P7 and P8.
+	 */
+	if (global) {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+		}
+
+		fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid);
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+	} else {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : :
+				     "r" (rbvalues[i]), "r" (0));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+}
+
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+			unsigned long pte_index, unsigned long avpn,
+			unsigned long *hpret)
+{
+	__be64 *hpte;
+	unsigned long v, r, rb;
+	struct revmap_entry *rev;
+	u64 pte, orig_pte, pte_r;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	pte = orig_pte = be64_to_cpu(hpte[0]);
+	pte_r = be64_to_cpu(hpte[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		pte = hpte_new_to_old_v(pte, pte_r);
+		pte_r = hpte_new_to_old_r(pte_r);
+	}
+	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
+		__unlock_hpte(hpte, orig_pte);
+		return H_NOT_FOUND;
+	}
+
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	v = pte & ~HPTE_V_HVLOCK;
+	if (v & HPTE_V_VALID) {
+		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
+		rb = compute_tlbie_rb(v, pte_r, pte_index);
+		do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
+		/*
+		 * The reference (R) and change (C) bits in a HPT
+		 * entry can be set by hardware at any time up until
+		 * the HPTE is invalidated and the TLB invalidation
+		 * sequence has completed.  This means that when
+		 * removing a HPTE, we need to re-read the HPTE after
+		 * the invalidation sequence has completed in order to
+		 * obtain reliable values of R and C.
+		 */
+		remove_revmap_chain(kvm, pte_index, rev, v,
+				    be64_to_cpu(hpte[1]));
+	}
+	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+	note_hpte_modification(kvm, rev);
+	unlock_hpte(hpte, 0);
+
+	if (is_mmio_hpte(v, pte_r))
+		atomic64_inc(&kvm->arch.mmio_update);
+
+	if (v & HPTE_V_ABSENT)
+		v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+	hpret[0] = v;
+	hpret[1] = r;
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn)
+{
+	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+				  &vcpu->arch.regs.gpr[4]);
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_remove);
+
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.regs.gpr[4];
+	__be64 *hp, *hptes[4];
+	unsigned long tlbrb[4];
+	long int i, j, k, n, found, indexes[4];
+	unsigned long flags, req, pte_index, rcbits;
+	int global;
+	long int ret = H_SUCCESS;
+	struct revmap_entry *rev, *revs[4];
+	u64 hp0, hp1;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	global = global_invalidates(kvm);
+	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
+		n = 0;
+		for (; i < 4; ++i) {
+			j = i * 2;
+			pte_index = args[j];
+			flags = pte_index >> 56;
+			pte_index &= ((1ul << 56) - 1);
+			req = flags >> 6;
+			flags &= 3;
+			if (req == 3) {		/* no more requests */
+				i = 4;
+				break;
+			}
+			if (req != 1 || flags == 3 ||
+			    pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
+				/* parameter error */
+				args[j] = ((0xa0 | flags) << 56) + pte_index;
+				ret = H_PARAMETER;
+				break;
+			}
+			hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4));
+			/* to avoid deadlock, don't spin except for first */
+			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
+				if (n)
+					break;
+				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
+					cpu_relax();
+			}
+			found = 0;
+			hp0 = be64_to_cpu(hp[0]);
+			hp1 = be64_to_cpu(hp[1]);
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				hp0 = hpte_new_to_old_v(hp0, hp1);
+				hp1 = hpte_new_to_old_r(hp1);
+			}
+			if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
+				switch (flags & 3) {
+				case 0:		/* absolute */
+					found = 1;
+					break;
+				case 1:		/* andcond */
+					if (!(hp0 & args[j + 1]))
+						found = 1;
+					break;
+				case 2:		/* AVPN */
+					if ((hp0 & ~0x7fUL) == args[j + 1])
+						found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				hp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				args[j] = ((0x90 | flags) << 56) + pte_index;
+				continue;
+			}
+
+			args[j] = ((0x80 | flags) << 56) + pte_index;
+			rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+			note_hpte_modification(kvm, rev);
+
+			if (!(hp0 & HPTE_V_VALID)) {
+				/* insert R and C bits from PTE */
+				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+				args[j] |= rcbits << (56 - 5);
+				hp[0] = 0;
+				if (is_mmio_hpte(hp0, hp1))
+					atomic64_inc(&kvm->arch.mmio_update);
+				continue;
+			}
+
+			/* leave it locked */
+			hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
+			tlbrb[n] = compute_tlbie_rb(hp0, hp1, pte_index);
+			indexes[n] = j;
+			hptes[n] = hp;
+			revs[n] = rev;
+			++n;
+		}
+
+		if (!n)
+			break;
+
+		/* Now that we've collected a batch, do the tlbies */
+		do_tlbies(kvm, tlbrb, n, global, true);
+
+		/* Read PTE low words after tlbie to get final R/C values */
+		for (k = 0; k < n; ++k) {
+			j = indexes[k];
+			pte_index = args[j] & ((1ul << 56) - 1);
+			hp = hptes[k];
+			rev = revs[k];
+			remove_revmap_chain(kvm, pte_index, rev,
+				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
+			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+			args[j] |= rcbits << (56 - 5);
+			__unlock_hpte(hp, 0);
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_bulk_remove);
+
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	struct revmap_entry *rev;
+	unsigned long v, r, rb, mask, bits;
+	u64 pte_v, pte_r;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = pte_v = be64_to_cpu(hpte[0]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1]));
+	if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) {
+		__unlock_hpte(hpte, pte_v);
+		return H_NOT_FOUND;
+	}
+
+	pte_r = be64_to_cpu(hpte[1]);
+	bits = (flags << 55) & HPTE_R_PP0;
+	bits |= (flags << 48) & HPTE_R_KEY_HI;
+	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	/* Update guest view of 2nd HPTE dword */
+	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	if (rev) {
+		r = (rev->guest_rpte & ~mask) | bits;
+		rev->guest_rpte = r;
+		note_hpte_modification(kvm, rev);
+	}
+
+	/* Update HPTE */
+	if (v & HPTE_V_VALID) {
+		/*
+		 * If the page is valid, don't let it transition from
+		 * readonly to writable.  If it should be writable, we'll
+		 * take a trap and let the page fault code sort it out.
+		 */
+		r = (pte_r & ~mask) | bits;
+		if (hpte_is_writable(r) && !hpte_is_writable(pte_r))
+			r = hpte_make_readonly(r);
+		/* If the PTE is changing, invalidate it first */
+		if (r != pte_r) {
+			rb = compute_tlbie_rb(v, r, pte_index);
+			hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) |
+					      HPTE_V_ABSENT);
+			do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
+			/* Don't lose R/C bit updates done by hardware */
+			r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C);
+			hpte[1] = cpu_to_be64(r);
+		}
+	}
+	unlock_hpte(hpte, pte_v & ~HPTE_V_HVLOCK);
+	asm volatile("ptesync" : : : "memory");
+	if (is_mmio_hpte(v, pte_r))
+		atomic64_inc(&kvm->arch.mmio_update);
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_protect);
+
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r;
+	int i, n = 1;
+	struct revmap_entry *rev = NULL;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+		v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
+		r = be64_to_cpu(hpte[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, r);
+			r = hpte_new_to_old_r(r);
+		}
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+		}
+		if (v & HPTE_V_VALID) {
+			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+			r &= ~HPTE_GR_RESERVED;
+		}
+		vcpu->arch.regs.gpr[4 + i * 2] = v;
+		vcpu->arch.regs.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_read);
+
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+			unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r, gr;
+	struct revmap_entry *rev;
+	unsigned long *rmap;
+	long ret = H_NOT_FOUND;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = be64_to_cpu(hpte[0]);
+	r = be64_to_cpu(hpte[1]);
+	if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+		goto out;
+
+	gr = rev->guest_rpte;
+	if (rev->guest_rpte & HPTE_R_R) {
+		rev->guest_rpte &= ~HPTE_R_R;
+		note_hpte_modification(kvm, rev);
+	}
+	if (v & HPTE_V_VALID) {
+		gr |= r & (HPTE_R_R | HPTE_R_C);
+		if (r & HPTE_R_R) {
+			kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+			rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
+			if (rmap) {
+				lock_rmap(rmap);
+				*rmap |= KVMPPC_RMAP_REFERENCED;
+				unlock_rmap(rmap);
+			}
+		}
+	}
+	vcpu->arch.regs.gpr[4] = gr;
+	ret = H_SUCCESS;
+ out:
+	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_clear_ref);
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+			unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r, gr;
+	struct revmap_entry *rev;
+	long ret = H_NOT_FOUND;
+
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
+	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+		return H_PARAMETER;
+
+	rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = be64_to_cpu(hpte[0]);
+	r = be64_to_cpu(hpte[1]);
+	if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+		goto out;
+
+	gr = rev->guest_rpte;
+	if (gr & HPTE_R_C) {
+		rev->guest_rpte &= ~HPTE_R_C;
+		note_hpte_modification(kvm, rev);
+	}
+	if (v & HPTE_V_VALID) {
+		/* need to make it temporarily absent so C is stable */
+		hpte[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hpte, pte_index);
+		r = be64_to_cpu(hpte[1]);
+		gr |= r & (HPTE_R_R | HPTE_R_C);
+		if (r & HPTE_R_C) {
+			hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
+			eieio();
+			kvmppc_set_dirty_from_hpte(kvm, v, gr);
+		}
+	}
+	vcpu->arch.regs.gpr[4] = gr;
+	ret = H_SUCCESS;
+ out:
+	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_clear_mod);
+
+static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+			  unsigned long gpa, int writing, unsigned long *hpa,
+			  struct kvm_memory_slot **memslot_p)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn, hva, pa, psize = PAGE_SHIFT;
+	unsigned int shift;
+	pte_t *ptep, pte;
+
+	/* Find the memslot for this address */
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return H_PARAMETER;
+
+	/* Translate to host virtual address */
+	hva = __gfn_to_hva_memslot(memslot, gfn);
+
+	/* Try to find the host pte for that virtual address */
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+	if (!ptep)
+		return H_TOO_HARD;
+	pte = kvmppc_read_update_linux_pte(ptep, writing);
+	if (!pte_present(pte))
+		return H_TOO_HARD;
+
+	/* Convert to a physical address */
+	if (shift)
+		psize = 1UL << shift;
+	pa = pte_pfn(pte) << PAGE_SHIFT;
+	pa |= hva & (psize - 1);
+	pa |= gpa & ~PAGE_MASK;
+
+	if (hpa)
+		*hpa = pa;
+	if (memslot_p)
+		*memslot_p = memslot;
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
+				       unsigned long dest)
+{
+	struct kvm_memory_slot *memslot;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long pa, mmu_seq;
+	long ret = H_SUCCESS;
+	int i;
+
+	/* Used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+
+	ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &pa, &memslot);
+	if (ret != H_SUCCESS)
+		goto out_unlock;
+
+	/* Zero the page */
+	for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
+		dcbz((void *)pa);
+	kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
+
+out_unlock:
+	arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
+	return ret;
+}
+
+static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
+				       unsigned long dest, unsigned long src)
+{
+	unsigned long dest_pa, src_pa, mmu_seq;
+	struct kvm_memory_slot *dest_memslot;
+	struct kvm *kvm = vcpu->kvm;
+	long ret = H_SUCCESS;
+
+	/* Used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+	ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &dest_pa, &dest_memslot);
+	if (ret != H_SUCCESS)
+		goto out_unlock;
+
+	ret = kvmppc_get_hpa(vcpu, mmu_seq, src, 0, &src_pa, NULL);
+	if (ret != H_SUCCESS)
+		goto out_unlock;
+
+	/* Copy the page */
+	memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
+
+	kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
+
+out_unlock:
+	arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
+	return ret;
+}
+
+long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
+			   unsigned long dest, unsigned long src)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 pg_mask = SZ_4K - 1;	/* 4K page size */
+	long ret = H_SUCCESS;
+
+	/* Don't handle radix mode here, go up to the virtual mode handler */
+	if (kvm_is_radix(kvm))
+		return H_TOO_HARD;
+
+	/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
+	if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
+		      H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
+		return H_PARAMETER;
+
+	/* dest (and src if copy_page flag set) must be page aligned */
+	if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
+		return H_PARAMETER;
+
+	/* zero and/or copy the page as determined by the flags */
+	if (flags & H_COPY_PAGE)
+		ret = kvmppc_do_h_page_init_copy(vcpu, dest, src);
+	else if (flags & H_ZERO_PAGE)
+		ret = kvmppc_do_h_page_init_zero(vcpu, dest);
+
+	/* We can ignore the other flags */
+
+	return ret;
+}
+
+void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
+			unsigned long pte_index)
+{
+	unsigned long rb;
+	u64 hp0, hp1;
+
+	hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
+	hp0 = be64_to_cpu(hptep[0]);
+	hp1 = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hp0 = hpte_new_to_old_v(hp0, hp1);
+		hp1 = hpte_new_to_old_r(hp1);
+	}
+	rb = compute_tlbie_rb(hp0, hp1, pte_index);
+	do_tlbies(kvm, &rb, 1, 1, true);
+}
+EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
+
+void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
+			   unsigned long pte_index)
+{
+	unsigned long rb;
+	unsigned char rbyte;
+	u64 hp0, hp1;
+
+	hp0 = be64_to_cpu(hptep[0]);
+	hp1 = be64_to_cpu(hptep[1]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		hp0 = hpte_new_to_old_v(hp0, hp1);
+		hp1 = hpte_new_to_old_r(hp1);
+	}
+	rb = compute_tlbie_rb(hp0, hp1, pte_index);
+	rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
+	/* modify only the second-last byte, which contains the ref bit */
+	*((char *)hptep + 14) = rbyte;
+	do_tlbies(kvm, &rb, 1, 1, false);
+}
+EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
+
+static int slb_base_page_shift[4] = {
+	24,	/* 16M */
+	16,	/* 64k */
+	34,	/* 16G */
+	20,	/* 1M, unsupported */
+};
+
+static struct mmio_hpte_cache_entry *mmio_cache_search(struct kvm_vcpu *vcpu,
+		unsigned long eaddr, unsigned long slb_v, long mmio_update)
+{
+	struct mmio_hpte_cache_entry *entry = NULL;
+	unsigned int pshift;
+	unsigned int i;
+
+	for (i = 0; i < MMIO_HPTE_CACHE_SIZE; i++) {
+		entry = &vcpu->arch.mmio_cache.entry[i];
+		if (entry->mmio_update == mmio_update) {
+			pshift = entry->slb_base_pshift;
+			if ((entry->eaddr >> pshift) == (eaddr >> pshift) &&
+			    entry->slb_v == slb_v)
+				return entry;
+		}
+	}
+	return NULL;
+}
+
+static struct mmio_hpte_cache_entry *
+			next_mmio_cache_entry(struct kvm_vcpu *vcpu)
+{
+	unsigned int index = vcpu->arch.mmio_cache.index;
+
+	vcpu->arch.mmio_cache.index++;
+	if (vcpu->arch.mmio_cache.index == MMIO_HPTE_CACHE_SIZE)
+		vcpu->arch.mmio_cache.index = 0;
+
+	return &vcpu->arch.mmio_cache.entry[index];
+}
+
+/* When called from virtmode, this func should be protected by
+ * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK
+ * can trigger deadlock issue.
+ */
+long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
+			      unsigned long valid)
+{
+	unsigned int i;
+	unsigned int pshift;
+	unsigned long somask;
+	unsigned long vsid, hash;
+	unsigned long avpn;
+	__be64 *hpte;
+	unsigned long mask, val;
+	unsigned long v, r, orig_v;
+
+	/* Get page shift, work out hash and AVPN etc. */
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
+	val = 0;
+	pshift = 12;
+	if (slb_v & SLB_VSID_L) {
+		mask |= HPTE_V_LARGE;
+		val |= HPTE_V_LARGE;
+		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
+	}
+	if (slb_v & SLB_VSID_B_1T) {
+		somask = (1UL << 40) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
+		vsid ^= vsid << 25;
+	} else {
+		somask = (1UL << 28) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
+	}
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt);
+	avpn = slb_v & ~(somask >> 16);	/* also includes B */
+	avpn |= (eaddr & somask) >> 16;
+
+	if (pshift >= 24)
+		avpn &= ~((1UL << (pshift - 16)) - 1);
+	else
+		avpn &= ~0x7fUL;
+	val |= avpn;
+
+	for (;;) {
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7));
+
+		for (i = 0; i < 16; i += 2) {
+			/* Read the PTE racily */
+			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				v = hpte_new_to_old_v(v, be64_to_cpu(hpte[i+1]));
+
+			/* Check valid/absent, hash, segment size and AVPN */
+			if (!(v & valid) || (v & mask) != val)
+				continue;
+
+			/* Lock the PTE and read it under the lock */
+			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
+				cpu_relax();
+			v = orig_v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+			r = be64_to_cpu(hpte[i+1]);
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				v = hpte_new_to_old_v(v, r);
+				r = hpte_new_to_old_r(r);
+			}
+
+			/*
+			 * Check the HPTE again, including base page size
+			 */
+			if ((v & valid) && (v & mask) == val &&
+			    kvmppc_hpte_base_page_shift(v, r) == pshift)
+				/* Return with the HPTE still locked */
+				return (hash << 3) + (i >> 1);
+
+			__unlock_hpte(&hpte[i], orig_v);
+		}
+
+		if (val & HPTE_V_SECONDARY)
+			break;
+		val |= HPTE_V_SECONDARY;
+		hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt);
+	}
+	return -1;
+}
+EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
+
+/*
+ * Called in real mode to check whether an HPTE not found fault
+ * is due to accessing a paged-out page or an emulated MMIO page,
+ * or if a protection fault is due to accessing a page that the
+ * guest wanted read/write access to but which we made read-only.
+ * Returns a possibly modified status (DSISR) value if not
+ * (i.e. pass the interrupt to the guest),
+ * -1 to pass the fault up to host kernel mode code, -2 to do that
+ * and also load the instruction word (for MMIO emulation),
+ * or 0 if we should make the guest retry the access.
+ */
+long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+			  unsigned long slb_v, unsigned int status, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	long int index;
+	unsigned long v, r, gr, orig_v;
+	__be64 *hpte;
+	unsigned long valid;
+	struct revmap_entry *rev;
+	unsigned long pp, key;
+	struct mmio_hpte_cache_entry *cache_entry = NULL;
+	long mmio_update = 0;
+
+	/* For protection fault, expect to find a valid HPTE */
+	valid = HPTE_V_VALID;
+	if (status & DSISR_NOHPTE) {
+		valid |= HPTE_V_ABSENT;
+		mmio_update = atomic64_read(&kvm->arch.mmio_update);
+		cache_entry = mmio_cache_search(vcpu, addr, slb_v, mmio_update);
+	}
+	if (cache_entry) {
+		index = cache_entry->pte_index;
+		v = cache_entry->hpte_v;
+		r = cache_entry->hpte_r;
+		gr = cache_entry->rpte;
+	} else {
+		index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
+		if (index < 0) {
+			if (status & DSISR_NOHPTE)
+				return status;	/* there really was no HPTE */
+			return 0;	/* for prot fault, HPTE disappeared */
+		}
+		hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+		v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
+		r = be64_to_cpu(hpte[1]);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			v = hpte_new_to_old_v(v, r);
+			r = hpte_new_to_old_r(r);
+		}
+		rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]);
+		gr = rev->guest_rpte;
+
+		unlock_hpte(hpte, orig_v);
+	}
+
+	/* For not found, if the HPTE is valid by now, retry the instruction */
+	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
+		return 0;
+
+	/* Check access permissions to the page */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
+	if (!data) {
+		if (gr & (HPTE_R_N | HPTE_R_G))
+			return status | SRR1_ISI_N_G_OR_CIP;
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | SRR1_ISI_PROT;
+	} else if (status & DSISR_ISSTORE) {
+		/* check write permission */
+		if (!hpte_write_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	} else {
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	}
+
+	/* Check storage key, if applicable */
+	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
+		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (status & DSISR_ISSTORE)
+			perm >>= 1;
+		if (perm & 1)
+			return status | DSISR_KEYFAULT;
+	}
+
+	/* Save HPTE info for virtual-mode handler */
+	vcpu->arch.pgfault_addr = addr;
+	vcpu->arch.pgfault_index = index;
+	vcpu->arch.pgfault_hpte[0] = v;
+	vcpu->arch.pgfault_hpte[1] = r;
+	vcpu->arch.pgfault_cache = cache_entry;
+
+	/* Check the storage key to see if it is possibly emulated MMIO */
+	if ((r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
+		if (!cache_entry) {
+			unsigned int pshift = 12;
+			unsigned int pshift_index;
+
+			if (slb_v & SLB_VSID_L) {
+				pshift_index = ((slb_v & SLB_VSID_LP) >> 4);
+				pshift = slb_base_page_shift[pshift_index];
+			}
+			cache_entry = next_mmio_cache_entry(vcpu);
+			cache_entry->eaddr = addr;
+			cache_entry->slb_base_pshift = pshift;
+			cache_entry->pte_index = index;
+			cache_entry->hpte_v = v;
+			cache_entry->hpte_r = r;
+			cache_entry->rpte = gr;
+			cache_entry->slb_v = slb_v;
+			cache_entry->mmio_update = mmio_update;
+		}
+		if (data && (vcpu->arch.shregs.msr & MSR_IR))
+			return -2;	/* MMIO emulation - load instr word */
+	}
+
+	return -1;		/* send fault up to host kernel mode */
+}
+EXPORT_SYMBOL_GPL(kvmppc_hpte_hv_fault);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 0000000000..e165bfa842
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,924 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/kernel_stat.h>
+#include <linux/pgtable.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/synch.h>
+#include <asm/cputhreads.h>
+#include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq, bool check_resend);
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+				struct kvmppc_ics *ics, struct kvmppc_icp *icp)
+{
+	int i;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+		if (state->resend)
+			icp_rm_deliver_irq(xics, icp, state->number, true);
+	}
+
+}
+
+/* -- ICP routines -- */
+
+#ifdef CONFIG_SMP
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
+{
+	int hcpu;
+
+	hcpu = hcore << threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
+	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
+	kvmppc_set_host_ipi(hcpu);
+	smp_mb();
+	kvmhv_rm_send_ipi(hcpu);
+}
+#else
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
+#endif
+
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go in a circle until we get back to our ID looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops.
+ *
+ * In the future, could consider using a fairer algorithm (one
+ * that distributes the IPIs better)
+ *
+ * Returns -1, if no CPU could be found in the host
+ * Else, returns a CPU Id which has been reserved for use
+ */
+static inline int grab_next_hostcore(int start,
+		struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+	bool success;
+	int core;
+	union kvmppc_rm_state old, new;
+
+	for (core = start + 1; core < max; core++)  {
+		old = new = READ_ONCE(rm_core[core].rm_state);
+
+		if (!old.in_host || old.rm_action)
+			continue;
+
+		/* Try to grab this host core if not taken already. */
+		new.rm_action = action;
+
+		success = cmpxchg64(&rm_core[core].rm_state.raw,
+						old.raw, new.raw) == old.raw;
+		if (success) {
+			/*
+			 * Make sure that the store to the rm_action is made
+			 * visible before we return to caller (and the
+			 * subsequent store to rm_data) to synchronize with
+			 * the IPI handler.
+			 */
+			smp_wmb();
+			return core;
+		}
+	}
+
+	return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+	int core;
+	int my_core = smp_processor_id() >> threads_shift;
+	struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+	core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+	if (core == -1)
+		core = grab_next_hostcore(core, rm_core, my_core, action);
+
+	return core;
+}
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
+				struct kvm_vcpu *this_vcpu)
+{
+	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+	int cpu;
+	int hcore;
+
+	/* Mark the target VCPU as having an interrupt pending */
+	vcpu->stat.queue_intr++;
+	set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+
+	/* Kick self ? Just set MER and return */
+	if (vcpu == this_vcpu) {
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+		return;
+	}
+
+	/*
+	 * Check if the core is loaded,
+	 * if not, find an available host core to post to wake the VCPU,
+	 * if we can't find one, set up state to eventually return too hard.
+	 */
+	cpu = vcpu->arch.thread_cpu;
+	if (cpu < 0 || cpu >= nr_cpu_ids) {
+		hcore = -1;
+		if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
+			hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
+		if (hcore != -1) {
+			icp_send_hcore_msg(hcore, vcpu);
+		} else {
+			this_icp->rm_action |= XICS_RM_KICK_VCPU;
+			this_icp->rm_kick_target = vcpu;
+		}
+		return;
+	}
+
+	smp_mb();
+	kvmhv_rm_send_ipi(cpu);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+	/* Note: Only called on self ! */
+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+				     union kvmppc_icp_state old,
+				     union kvmppc_icp_state new)
+{
+	struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee)
+		icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+	/* Expose the state change for debug purposes */
+	this_vcpu->arch.icp->rm_dbgstate = new;
+	this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+	return success;
+}
+
+static inline int check_too_hard(struct kvmppc_xics *xics,
+				 struct kvmppc_icp *icp)
+{
+	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_rm_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq, bool check_resend)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		/* Unsafe increment, but this does not need to be accurate */
+		xics->err_noics++;
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			/* Unsafe increment again*/
+			xics->err_noicp++;
+			goto out;
+		}
+	}
+
+	if (check_resend)
+		if (!state->resend)
+			goto out;
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_rm_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			icp->n_reject++;
+			new_irq = reject;
+			check_resend = 0;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		state->resend = 1;
+
+		/*
+		 * Make sure when checking resend, we don't miss the resend
+		 * if resend_map bit is seen and cleared.
+		 */
+		smp_wmb();
+		set_bit(ics->icsid, icp->resend_map);
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_rm_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			state->resend = 0;
+			arch_spin_unlock(&ics->lock);
+			check_resend = 0;
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			     u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here as well.
+	 */
+	if (resend) {
+		icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
+	}
+}
+
+unsigned long xics_rm_h_xirr_x(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.regs.gpr[5] = get_tb();
+	return xics_rm_h_xirr(vcpu);
+}
+
+unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/* First clear the interrupt */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Return the result in GPR4 */
+	vcpu->arch.regs.gpr[4] = xirr;
+
+	return check_too_hard(xics, icp);
+}
+
+int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+		  unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	local = this_icp->server_num == server;
+	if (local)
+		icp = this_icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be done as there can be no XISR to
+	 * reject.
+	 *
+	 * ICP state: Check_IPI
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it.
+	 *
+	 * ICP State: IPI
+	 *
+	 * Besides rejecting any pending interrupts, we also
+	 * update XISR and pending_pri to mark IPI as pending.
+	 *
+	 * PAPR does not describe this state, but if the MFRR is being
+	 * made less favored than its earlier value, there might be
+	 * a previously-rejected interrupt needing to be resent.
+	 * Ideally, we would want to resend only if
+	 *	prio(pending_interrupt) < mfrr &&
+	 *	prio(pending_interrupt) < cppr
+	 * where pending interrupt is the one that was rejected. But
+	 * we don't have that state, so we simply trigger a resend
+	 * whenever the MFRR is made less favored.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Handle reject in real mode */
+	if (reject && reject != XICS_IPI) {
+		this_icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject, false);
+	}
+
+	/* Handle resends in real mode */
+	if (resend) {
+		this_icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
+	}
+
+	return check_too_hard(xics, this_icp);
+}
+
+int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr) {
+		icp_rm_down_cppr(xics, icp, cppr);
+		goto bail;
+	} else if (cppr == icp->state.cppr)
+		return H_SUCCESS;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_rm_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI) {
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject, false);
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
+
+static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	u32 pq_old, pq_new;
+
+	/*
+	 * ICS EOI handling: For LSI, if P bit is still set, we need to
+	 * resend it.
+	 *
+	 * For MSI, we move Q bit into P (and clear Q). If it is set,
+	 * resend it.
+	 */
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		goto bail;
+
+	state = &ics->irq_state[src];
+
+	if (state->lsi)
+		pq_new = state->pq_state;
+	else
+		do {
+			pq_old = state->pq_state;
+			pq_new = pq_old >> 1;
+		} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	if (pq_new & PQ_PRESENTED)
+		icp_rm_deliver_irq(xics, NULL, irq, false);
+
+	if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
+		icp->rm_action |= XICS_RM_NOTIFY_EOI;
+		icp->rm_eoied_irq = irq;
+	}
+
+	/* Handle passthrough interrupts */
+	if (state->host_irq) {
+		++vcpu->stat.pthru_all;
+		if (state->intr_cpu != -1) {
+			int pcpu = raw_smp_processor_id();
+
+			pcpu = cpu_first_thread_sibling(pcpu);
+			++vcpu->stat.pthru_host;
+			if (state->intr_cpu != pcpu) {
+				++vcpu->stat.pthru_bad_aff;
+				xics_opal_set_server(state->host_irq, pcpu);
+			}
+			state->intr_cpu = -1;
+		}
+	}
+
+ bail:
+	return check_too_hard(xics, icp);
+}
+
+int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 irq = xirr & 0x00ffffff;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return check_too_hard(xics, icp);
+
+	return ics_rm_eoi(vcpu, irq);
+}
+
+static unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again)
+{
+	void __iomem *xics_phys;
+	int64_t rc;
+
+	rc = pnv_opal_pci_msi_eoi(d);
+
+	if (rc)
+		eoi_rc = rc;
+
+	iosync();
+
+	/* EOI it */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	if (xics_phys) {
+		__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
+	} else {
+		rc = opal_int_eoi(be32_to_cpu(xirr));
+		*again = rc > 0;
+	}
+}
+
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu)
+{
+	unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+	return opal_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+	unsigned long l;
+	unsigned int *raddr;
+	int cpu = smp_processor_id();
+
+	raddr = per_cpu_ptr(addr, cpu);
+	l = (unsigned long)raddr;
+
+	if (get_region_id(l) == VMALLOC_REGION_ID) {
+		l = vmalloc_to_phys(raddr);
+		raddr = (unsigned int *)l;
+	}
+	++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *    updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *    IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *    and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+	this_cpu_inc_rm(desc->kstat_irqs);
+	__this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+				 __be32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap,
+				 bool *again)
+{
+	struct kvmppc_xics *xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq;
+	u16 src;
+	u32 pq_old, pq_new;
+
+	irq = irq_map->v_hwirq;
+	xics = vcpu->kvm->arch.xics;
+	icp = vcpu->arch.icp;
+
+	kvmppc_rm_handle_irq_desc(irq_map->desc);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return 2;
+
+	state = &ics->irq_state[src];
+
+	/* only MSIs register bypass producers, so it must be MSI here */
+	do {
+		pq_old = state->pq_state;
+		pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+	} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	/* Test P=1, Q=0, this is the only case where we present */
+	if (pq_new == PQ_PRESENTED)
+		icp_rm_deliver_irq(xics, icp, irq, false);
+
+	/* EOI the interrupt */
+	icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, again);
+
+	if (check_too_hard(xics, icp) == H_TOO_HARD)
+		return 2;
+	else
+		return -2;
+}
+
+/*  --- Non-real mode XICS-related built-in routines ---  */
+
+/*
+ * Host Operations poked by RM KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+	switch (action) {
+	case XICS_RM_KICK_VCPU:
+		kvmppc_host_rm_ops_hv->vcpu_kick(data);
+		break;
+	default:
+		WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+		break;
+	}
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+	int core;
+	unsigned int cpu = smp_processor_id();
+	struct kvmppc_host_rm_core *rm_corep;
+
+	core = cpu >> threads_shift;
+	rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+	if (rm_corep->rm_data) {
+		rm_host_ipi_action(rm_corep->rm_state.rm_action,
+							rm_corep->rm_data);
+		/* Order these stores against the real mode KVM */
+		rm_corep->rm_data = NULL;
+		smp_wmb();
+		rm_corep->rm_state.rm_action = 0;
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 0000000000..ea7ad200b3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,3026 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/ppc_asm.h>
+#include <asm/code-patching-asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/book3s/64/mmu-hash.h>
+#include <asm/tm.h>
+#include <asm/opal.h>
+#include <asm/thread_info.h>
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+#include <asm/cpuidle.h>
+
+/* Values in HSTATE_NAPPING(r13) */
+#define NAPPING_CEDE	1
+#define NAPPING_NOVCPU	2
+#define NAPPING_UNSPLIT	3
+
+/* Stack frame offsets for kvmppc_hv_entry */
+#define SFS			160
+#define STACK_SLOT_TRAP		(SFS-4)
+#define STACK_SLOT_TID		(SFS-16)
+#define STACK_SLOT_PSSCR	(SFS-24)
+#define STACK_SLOT_PID		(SFS-32)
+#define STACK_SLOT_IAMR		(SFS-40)
+#define STACK_SLOT_CIABR	(SFS-48)
+#define STACK_SLOT_DAWR0	(SFS-56)
+#define STACK_SLOT_DAWRX0	(SFS-64)
+#define STACK_SLOT_HFSCR	(SFS-72)
+#define STACK_SLOT_AMR		(SFS-80)
+#define STACK_SLOT_UAMOR	(SFS-88)
+#define STACK_SLOT_FSCR		(SFS-96)
+
+/*
+ * Use the last LPID (all implemented LPID bits = 1) for partition switching.
+ * This is reserved in the LPID allocator. POWER7 only implements 0x3ff, but
+ * we write 0xfff into the LPID SPR anyway, which seems to work and just
+ * ignores the top bits.
+ */
+#define   LPID_RSVD		0xfff
+
+/*
+ * Call kvmppc_hv_entry in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ */
+_GLOBAL_TOC(kvmppc_hv_entry_trampoline)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -112(r1)
+	mfmsr	r10
+	std	r10, HSTATE_HOST_MSR(r13)
+	LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
+	li	r0,MSR_RI
+	andc	r0,r10,r0
+	li	r6,MSR_IR | MSR_DR
+	andc	r6,r10,r6
+	mtmsrd	r0,1		/* clear RI in MSR */
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI_TO_KERNEL
+
+kvmppc_call_hv_entry:
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	bl	kvmppc_hv_entry
+
+	/* Back from guest - restore host state and return to caller */
+
+BEGIN_FTR_SECTION
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+	/* Restore SPRG3 */
+	ld	r3,PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE,r3
+
+	/* Reload the host's PMU registers */
+	bl	kvmhv_load_host_pmu
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	/* hwthread_req may have got set by cede or no vcpu, so clear it */
+	li	r0, 0
+	stb	r0, HSTATE_HWTHREAD_REQ(r13)
+
+	/*
+	 * For external interrupts we need to call the Linux
+	 * handler to process the interrupt. We do that by jumping
+	 * to absolute address 0x500 for external interrupts.
+	 * The [h]rfid at the end of the handler will return to
+	 * the book3s_hv_interrupts.S code. For other interrupts
+	 * we do the rfid to get back to the book3s_hv_interrupts.S
+	 * code here.
+	 */
+	ld	r8, 112+PPC_LR_STKOFF(r1)
+	addi	r1, r1, 112
+	ld	r7, HSTATE_HOST_MSR(r13)
+
+	/* Return the trap number on this thread as the return value */
+	mr	r3, r12
+
+	/* RFI into the highmem handler */
+	mfmsr	r6
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+	mtmsrd	r6, 1			/* Clear RI in MSR */
+	mtsrr0	r8
+	mtsrr1	r7
+	RFI_TO_KERNEL
+
+kvmppc_primary_no_guest:
+	/* We handle this much like a ceded vcpu */
+	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+	/* HDEC may be larger than DEC for arch >= v3.00, but since the */
+	/* HDEC value came from DEC in the first place, it will fit */
+	mfspr	r3, SPRN_HDEC
+	mtspr	SPRN_DEC, r3
+	/*
+	 * Make sure the primary has finished the MMU switch.
+	 * We should never get here on a secondary thread, but
+	 * check it for robustness' sake.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+65:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	65b
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+	/* set our bit in napping_threads */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+1:	lwarx	r3, 0, r6
+	or	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	1b
+	/* order napping_threads update vs testing entry_exit_map */
+	isync
+	li	r12, 0
+	lwz	r7, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7, 0x100
+	bge	kvm_novcpu_exit	/* another thread already exiting */
+	li	r3, NAPPING_NOVCPU
+	stb	r3, HSTATE_NAPPING(r13)
+
+	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
+	b	kvm_do_nap
+
+/*
+ * kvm_novcpu_wakeup
+ *	Entered from kvm_start_guest if kvm_hstate.napping is set
+ *	to NAPPING_NOVCPU
+ *		r2 = kernel TOC
+ *		r13 = paca
+ */
+kvm_novcpu_wakeup:
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+
+	/* check the wake reason */
+	bl	kvmppc_check_wake_reason
+
+	/*
+	 * Restore volatile registers since we could have called
+	 * a C routine in kvmppc_check_wake_reason.
+	 *	r5 = VCORE
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+
+	/* see if any other thread is already exiting */
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	bge	kvm_novcpu_exit
+
+	/* clear our bit in napping_threads */
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+4:	lwarx	r7, 0, r6
+	andc	r7, r7, r0
+	stwcx.	r7, 0, r6
+	bne	4b
+
+	/* See if the wake reason means we need to exit */
+	cmpdi	r3, 0
+	bge	kvm_novcpu_exit
+
+	/* See if our timeslice has expired (HDEC is negative) */
+	mfspr	r0, SPRN_HDEC
+	extsw	r0, r0
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	cmpdi	r0, 0
+	blt	kvm_novcpu_exit
+
+	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+#endif
+	b	kvmppc_got_guest
+
+kvm_novcpu_exit:
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	13f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+13:	mr	r3, r12
+	stw	r12, STACK_SLOT_TRAP(r1)
+	bl	kvmhv_commence_exit
+	nop
+	b	kvmhv_switch_to_host
+
+/*
+ * We come in here when wakened from Linux offline idle code.
+ * Relocation is off
+ * r3 contains the SRR1 wakeup value, SRR1 is trashed.
+ */
+_GLOBAL(idle_kvm_start_guest)
+	mfcr	r5
+	mflr	r0
+	std	r5, 8(r1)	// Save CR in caller's frame
+	std	r0, 16(r1)	// Save LR in caller's frame
+	// Create frame on emergency stack
+	ld	r4, PACAEMERGSP(r13)
+	stdu	r1, -SWITCH_FRAME_SIZE(r4)
+	// Switch to new frame on emergency stack
+	mr	r1, r4
+	std	r3, 32(r1)	// Save SRR1 wakeup value
+	SAVE_NVGPRS(r1)
+
+	/*
+	 * Could avoid this and pass it through in r3. For now,
+	 * code expects it to be in SRR1.
+	 */
+	mtspr	SPRN_SRR1,r3
+
+	li	r0,0
+	stb	r0,PACA_FTRACE_ENABLED(r13)
+
+	li	r0,KVM_HWTHREAD_IN_KVM
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+
+	/* kvm cede / napping does not come through here */
+	lbz	r0,HSTATE_NAPPING(r13)
+	twnei	r0,0
+
+	b	1f
+
+kvm_unsplit_wakeup:
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+
+1:
+
+	/*
+	 * We weren't napping due to cede, so this must be a secondary
+	 * thread being woken up to run a guest, or being woken up due
+	 * to a stray IPI.  (Or due to some machine check or hypervisor
+	 * maintenance interrupt while the core is in KVM.)
+	 */
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	bl	kvmppc_check_wake_reason
+	/*
+	 * kvmppc_check_wake_reason could invoke a C routine, but we
+	 * have no volatile registers to restore when we return.
+	 */
+
+	cmpdi	r3, 0
+	bge	kvm_no_guest
+
+	/* get vcore pointer, NULL if we have nothing to run */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	cmpdi	r5,0
+	/* if we have no vcore to run, go back to sleep */
+	beq	kvm_no_guest
+
+kvm_secondary_got_guest:
+
+	// About to go to guest, clear saved SRR1
+	li	r0, 0
+	std	r0, 32(r1)
+
+	/* Set HSTATE_DSCR(r13) to something sensible */
+	ld	r6, PACA_DSCR_DEFAULT(r13)
+	std	r6, HSTATE_DSCR(r13)
+
+	/* On thread 0 of a subcore, set HDEC to max */
+	lbz	r4, HSTATE_PTID(r13)
+	cmpwi	r4, 0
+	bne	63f
+	lis	r6,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC, r6
+	/* and set per-LPAR registers, if doing dynamic micro-threading */
+	ld	r6, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r6, 0
+	beq	63f
+	ld	r0, KVM_SPLIT_RPR(r6)
+	mtspr	SPRN_RPR, r0
+	ld	r0, KVM_SPLIT_PMMAR(r6)
+	mtspr	SPRN_PMMAR, r0
+	ld	r0, KVM_SPLIT_LDBAR(r6)
+	mtspr	SPRN_LDBAR, r0
+	isync
+63:
+	/* Order load of vcpu after load of vcore */
+	lwsync
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	bl	kvmppc_hv_entry
+
+	/* Back from the guest, go back to nap */
+	/* Clear our vcpu and vcore pointers so we don't come back in early */
+	li	r0, 0
+	std	r0, HSTATE_KVM_VCPU(r13)
+	/*
+	 * Once we clear HSTATE_KVM_VCORE(r13), the code in
+	 * kvmppc_run_core() is going to assume that all our vcpu
+	 * state is visible in memory.  This lwsync makes sure
+	 * that that is true.
+	 */
+	lwsync
+	std	r0, HSTATE_KVM_VCORE(r13)
+
+	/*
+	 * All secondaries exiting guest will fall through this path.
+	 * Before proceeding, just check for HMI interrupt and
+	 * invoke opal hmi handler. By now we are sure that the
+	 * primary thread on this core/subcore has already made partition
+	 * switch/TB resync and we are good to call opal hmi handler.
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	bne	kvm_no_guest
+
+	li	r3,0			/* NULL argument */
+	bl	CFUNC(hmi_exception_realmode)
+/*
+ * At this point we have finished executing in the guest.
+ * We need to wait for hwthread_req to become zero, since
+ * we may not turn on the MMU while hwthread_req is non-zero.
+ * While waiting we also need to check if we get given a vcpu to run.
+ */
+kvm_no_guest:
+	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r3, 0
+	bne	53f
+	HMT_MEDIUM
+	li	r0, KVM_HWTHREAD_IN_KERNEL
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	/* need to recheck hwthread_req after a barrier, to avoid race */
+	sync
+	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r3, 0
+	bne	54f
+
+	/*
+	 * Jump to idle_return_gpr_loss, which returns to the
+	 * idle_kvm_start_guest caller.
+	 */
+	li	r3, LPCR_PECE0
+	mfspr	r4, SPRN_LPCR
+	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
+	mtspr	SPRN_LPCR, r4
+	// Return SRR1 wakeup value, or 0 if we went into the guest
+	ld	r3, 32(r1)
+	REST_NVGPRS(r1)
+	ld	r1, 0(r1)	// Switch back to caller stack
+	ld	r0, 16(r1)	// Reload LR
+	ld	r5, 8(r1)	// Reload CR
+	mtlr	r0
+	mtcr	r5
+	blr
+
+53:
+	HMT_LOW
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	cmpdi	r5, 0
+	bne	60f
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	kvm_no_guest
+	lbz	r0, KVM_SPLIT_DO_NAP(r3)
+	cmpwi	r0, 0
+	beq	kvm_no_guest
+	HMT_MEDIUM
+	b	kvm_unsplit_nap
+60:	HMT_MEDIUM
+	b	kvm_secondary_got_guest
+
+54:	li	r0, KVM_HWTHREAD_IN_KVM
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	b	kvm_no_guest
+
+/*
+ * Here the primary thread is trying to return the core to
+ * whole-core mode, so we need to nap.
+ */
+kvm_unsplit_nap:
+	/*
+	 * When secondaries are napping in kvm_unsplit_nap() with
+	 * hwthread_req = 1, HMI goes ignored even though subcores are
+	 * already exited the guest. Hence HMI keeps waking up secondaries
+	 * from nap in a loop and secondaries always go back to nap since
+	 * no vcore is assigned to them. This makes impossible for primary
+	 * thread to get hold of secondary threads resulting into a soft
+	 * lockup in KVM path.
+	 *
+	 * Let us check if HMI is pending and handle it before we go to nap.
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	bne	55f
+	li	r3, 0			/* NULL argument */
+	bl	CFUNC(hmi_exception_realmode)
+55:
+	/*
+	 * Ensure that secondary doesn't nap when it has
+	 * its vcore pointer set.
+	 */
+	sync		/* matches smp_mb() before setting split_info.do_nap */
+	ld	r0, HSTATE_KVM_VCORE(r13)
+	cmpdi	r0, 0
+	bne	kvm_no_guest
+	/* clear any pending message */
+BEGIN_FTR_SECTION
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	/* Set kvm_split_mode.napped[tid] = 1 */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	li	r0, 1
+	lhz	r4, PACAPACAINDEX(r13)
+	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
+	addi	r4, r4, KVM_SPLIT_NAPPED
+	stbx	r0, r3, r4
+	/* Check the do_nap flag again after setting napped[] */
+	sync
+	lbz	r0, KVM_SPLIT_DO_NAP(r3)
+	cmpwi	r0, 0
+	beq	57f
+	li	r3, NAPPING_UNSPLIT
+	stb	r3, HSTATE_NAPPING(r13)
+	li	r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
+	mfspr	r5, SPRN_LPCR
+	rlwimi	r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+	b	kvm_nap_sequence
+
+57:	li	r0, 0
+	stbx	r0, r3, r4
+	b	kvm_no_guest
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+SYM_CODE_START_LOCAL(kvmppc_hv_entry)
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer (or NULL)
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * R2 = TOC
+	 * all other volatile GPRS = free
+	 * Does not preserve non-volatile GPRs or CR fields
+	 */
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -SFS(r1)
+
+	/* Save R1 in the PACA */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	li	r6, KVM_GUEST_MODE_HOST_HV
+	stb	r6, HSTATE_IN_GUEST(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	/* Store initial timestamp */
+	cmpdi	r4, 0
+	beq	1f
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+1:
+#endif
+
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r9, VCORE_KVM(r5)	/* pointer to struct kvm */
+
+	/*
+	 * POWER7/POWER8 host -> guest partition switch code.
+	 * We don't have to lock against concurrent tlbies,
+	 * but we do have to coordinate across hardware threads.
+	 */
+	/* Set bit in entry map iff exit map is zero. */
+	li	r7, 1
+	lbz	r6, HSTATE_PTID(r13)
+	sld	r7, r7, r6
+	addi	r8, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r8
+	cmpwi	r3, 0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	or	r3, r3, r7
+	stwcx.	r3, 0, r8
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	cmpwi	r6,0
+	bne	10f
+
+	lwz	r7,KVM_LPID(r9)
+	ld	r6,KVM_SDR1(r9)
+	li	r0,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+
+	/* See if we need to flush the TLB. */
+	mr	r3, r9			/* kvm pointer */
+	lhz	r4, PACAPACAINDEX(r13)	/* physical cpu number */
+	li	r5, 0			/* nested vcpu pointer */
+	bl	kvmppc_check_need_tlb_flush
+	nop
+	ld	r5, HSTATE_KVM_VCORE(r13)
+
+	/* Add timebase offset onto timebase */
+22:	ld	r8,VCORE_TB_OFFSET(r5)
+	cmpdi	r8,0
+	beq	37f
+	std	r8, VCORE_TB_OFFSET_APPL(r5)
+	mftb	r6		/* current host timebase */
+	add	r8,r8,r6
+	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
+	mftb	r7		/* check if lower 24 bits overflowed */
+	clrldi	r6,r6,40
+	clrldi	r7,r7,40
+	cmpld	r7,r6
+	bge	37f
+	addis	r8,r8,0x100	/* if so, increment upper 40 bits */
+	mtspr	SPRN_TBU40,r8
+
+	/* Load guest PCR value to select appropriate compat mode */
+37:	ld	r7, VCORE_PCR(r5)
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r7, r6
+	beq	38f
+	or	r7, r7, r6
+	mtspr	SPRN_PCR, r7
+38:
+
+BEGIN_FTR_SECTION
+	/* DPDES and VTB are shared between threads */
+	ld	r8, VCORE_DPDES(r5)
+	ld	r7, VCORE_VTB(r5)
+	mtspr	SPRN_DPDES, r8
+	mtspr	SPRN_VTB, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/* Mark the subcore state as inside guest */
+	bl	kvmppc_subcore_enter_guest
+	nop
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+
+	/* Do we have a guest vcpu to run? */
+10:	cmpdi	r4, 0
+	beq	kvmppc_primary_no_guest
+kvmppc_got_guest:
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	li	r6, LPPACA_YIELDCOUNT
+	LWZX_BE	r5, r3, r6
+	addi	r5, r5, 1
+	STWX_BE	r5, r3, r6
+	li	r6, 1
+	stb	r6, VCPU_VPA_DIRTY(r4)
+25:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+	/* Save host values of some registers */
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_CIABR
+	mfspr	r6, SPRN_DAWR0
+	mfspr	r7, SPRN_DAWRX0
+	mfspr	r8, SPRN_IAMR
+	std	r5, STACK_SLOT_CIABR(r1)
+	std	r6, STACK_SLOT_DAWR0(r1)
+	std	r7, STACK_SLOT_DAWRX0(r1)
+	std	r8, STACK_SLOT_IAMR(r1)
+	mfspr	r5, SPRN_FSCR
+	std	r5, STACK_SLOT_FSCR(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	mfspr	r5, SPRN_AMR
+	std	r5, STACK_SLOT_AMR(r1)
+	mfspr	r6, SPRN_UAMOR
+	std	r6, STACK_SLOT_UAMOR(r1)
+
+BEGIN_FTR_SECTION
+	/* Set partition DABR */
+	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
+	lwz	r5,VCPU_DABRX(r4)
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+	isync
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	91f
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
+	 */
+	mr      r3, r4
+	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
+	bl	kvmppc_restore_tm_hv
+	nop
+	ld	r4, HSTATE_KVM_VCPU(r13)
+91:
+#endif
+
+	/* Load guest PMU registers; r4 = vcpu pointer here */
+	mr	r3, r4
+	bl	kvmhv_load_guest_pmu
+
+	/* Load up FP, VMX and VSX registers */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	bl	kvmppc_load_fp
+
+	ld	r14, VCPU_GPR(R14)(r4)
+	ld	r15, VCPU_GPR(R15)(r4)
+	ld	r16, VCPU_GPR(R16)(r4)
+	ld	r17, VCPU_GPR(R17)(r4)
+	ld	r18, VCPU_GPR(R18)(r4)
+	ld	r19, VCPU_GPR(R19)(r4)
+	ld	r20, VCPU_GPR(R20)(r4)
+	ld	r21, VCPU_GPR(R21)(r4)
+	ld	r22, VCPU_GPR(R22)(r4)
+	ld	r23, VCPU_GPR(R23)(r4)
+	ld	r24, VCPU_GPR(R24)(r4)
+	ld	r25, VCPU_GPR(R25)(r4)
+	ld	r26, VCPU_GPR(R26)(r4)
+	ld	r27, VCPU_GPR(R27)(r4)
+	ld	r28, VCPU_GPR(R28)(r4)
+	ld	r29, VCPU_GPR(R29)(r4)
+	ld	r30, VCPU_GPR(R30)(r4)
+	ld	r31, VCPU_GPR(R31)(r4)
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+BEGIN_FTR_SECTION
+	/* Skip next section on POWER7 */
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Load up POWER8-specific registers */
+	ld	r5, VCPU_IAMR(r4)
+	lwz	r6, VCPU_PSPB(r4)
+	ld	r7, VCPU_FSCR(r4)
+	mtspr	SPRN_IAMR, r5
+	mtspr	SPRN_PSPB, r6
+	mtspr	SPRN_FSCR, r7
+	/*
+	 * Handle broken DAWR case by not writing it. This means we
+	 * can still store the DAWR register for migration.
+	 */
+	LOAD_REG_ADDR(r5, dawr_force_enable)
+	lbz	r5, 0(r5)
+	cmpdi	r5, 0
+	beq	1f
+	ld	r5, VCPU_DAWR0(r4)
+	ld	r6, VCPU_DAWRX0(r4)
+	mtspr	SPRN_DAWR0, r5
+	mtspr	SPRN_DAWRX0, r6
+1:
+	ld	r7, VCPU_CIABR(r4)
+	ld	r8, VCPU_TAR(r4)
+	mtspr	SPRN_CIABR, r7
+	mtspr	SPRN_TAR, r8
+	ld	r5, VCPU_IC(r4)
+	ld	r8, VCPU_EBBHR(r4)
+	mtspr	SPRN_IC, r5
+	mtspr	SPRN_EBBHR, r8
+	ld	r5, VCPU_EBBRR(r4)
+	ld	r6, VCPU_BESCR(r4)
+	lwz	r7, VCPU_GUEST_PID(r4)
+	ld	r8, VCPU_WORT(r4)
+	mtspr	SPRN_EBBRR, r5
+	mtspr	SPRN_BESCR, r6
+	mtspr	SPRN_PID, r7
+	mtspr	SPRN_WORT, r8
+	/* POWER8-only registers */
+	ld	r5, VCPU_TCSCR(r4)
+	ld	r6, VCPU_ACOP(r4)
+	ld	r7, VCPU_CSIGR(r4)
+	ld	r8, VCPU_TACR(r4)
+	mtspr	SPRN_TCSCR, r5
+	mtspr	SPRN_ACOP, r6
+	mtspr	SPRN_CSIGR, r7
+	mtspr	SPRN_TACR, r8
+	nop
+8:
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+
+	/* Restore state of CTRL run bit; the host currently has it set to 1 */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	li	r6,0
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Secondary threads wait for primary to have done partition switch */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, HSTATE_PTID(r13)
+	cmpwi	r6, 0
+	beq	21f
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	bne	21f
+	HMT_LOW
+20:	lwz	r3, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r3, 0x100
+	bge	no_switch_exit
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	20b
+	HMT_MEDIUM
+21:
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+
+	/* Check if HDEC expires soon */
+	mfspr	r3, SPRN_HDEC
+	extsw	r3, r3
+	cmpdi	r3, 512		/* 1 microsecond */
+	blt	hdec_soon
+
+	/* Clear out and reload the SLB */
+	li	r6, 0
+	slbmte	r6, r6
+	PPC_SLBIA(6)
+	ptesync
+
+	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
+deliver_guest_interrupt:	/* r4 = vcpu, r13 = paca */
+	/* Check if we can deliver an external or decrementer interrupt now */
+	ld	r0, VCPU_PENDING_EXC(r4)
+	cmpdi	r0, 0
+	beq	71f
+	mr	r3, r4
+	bl	CFUNC(kvmppc_guest_entry_inject_int)
+	ld	r4, HSTATE_KVM_VCPU(r13)
+71:
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+	ld	r10, VCPU_PC(r4)
+	ld	r11, VCPU_MSR(r4)
+	/* r11 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+	ld	r6, VCPU_CTR(r4)
+	ld	r7, VCPU_XER(r4)
+	mtctr	r6
+	mtxer	r7
+
+/*
+ * Required state:
+ * R4 = vcpu
+ * R10: value for HSRR0
+ * R11: value for HSRR1
+ * R13 = PACA
+ */
+fast_guest_return:
+	li	r0,0
+	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST_HV
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	/* Accumulate timing */
+	addi	r3, r4, VCPU_TB_GUEST
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* Enter guest */
+
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_CFAR(r4)
+	mtspr	SPRN_CFAR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+	ld	r0, VCPU_PPR(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	ld	r5, VCPU_LR(r4)
+	mtlr	r5
+
+	ld	r1, VCPU_GPR(R1)(r4)
+	ld	r5, VCPU_GPR(R5)(r4)
+	ld	r8, VCPU_GPR(R8)(r4)
+	ld	r9, VCPU_GPR(R9)(r4)
+	ld	r10, VCPU_GPR(R10)(r4)
+	ld	r11, VCPU_GPR(R11)(r4)
+	ld	r12, VCPU_GPR(R12)(r4)
+	ld	r13, VCPU_GPR(R13)(r4)
+
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PPR, r0
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	ld	r6, VCPU_GPR(R6)(r4)
+	ld	r7, VCPU_GPR(R7)(r4)
+
+	ld	r0, VCPU_CR(r4)
+	mtcr	r0
+
+	ld	r0, VCPU_GPR(R0)(r4)
+	ld	r2, VCPU_GPR(R2)(r4)
+	ld	r3, VCPU_GPR(R3)(r4)
+	ld	r4, VCPU_GPR(R4)(r4)
+	HRFI_TO_GUEST
+	b	.
+SYM_CODE_END(kvmppc_hv_entry)
+
+secondary_too_late:
+	li	r12, 0
+	stw	r12, STACK_SLOT_TRAP(r1)
+	cmpdi	r4, 0
+	beq	11f
+	stw	r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+11:	b	kvmhv_switch_to_host
+
+no_switch_exit:
+	HMT_MEDIUM
+	li	r12, 0
+	b	12f
+hdec_soon:
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+12:	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+	b	guest_bypass
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt_hv
+kvmppc_interrupt_hv:
+	/*
+	 * Register contents:
+	 * R9		= HSTATE_IN_GUEST
+	 * R12		= (guest CR << 32) | interrupt vector
+	 * R13		= PACA
+	 * guest R12 saved in shadow VCPU SCRATCH0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 * guest R9 saved in HSTATE_SCRATCH2
+	 */
+	/* We're now back in the host but in guest MMU context */
+	cmpwi	r9,KVM_GUEST_MODE_HOST_HV
+	beq	kvmppc_bad_host_intr
+	li	r9, KVM_GUEST_MODE_HOST_HV
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(R0)(r9)
+	std	r1, VCPU_GPR(R1)(r9)
+	std	r2, VCPU_GPR(R2)(r9)
+	std	r3, VCPU_GPR(R3)(r9)
+	std	r4, VCPU_GPR(R4)(r9)
+	std	r5, VCPU_GPR(R5)(r9)
+	std	r6, VCPU_GPR(R6)(r9)
+	std	r7, VCPU_GPR(R7)(r9)
+	std	r8, VCPU_GPR(R8)(r9)
+	ld	r0, HSTATE_SCRATCH2(r13)
+	std	r0, VCPU_GPR(R9)(r9)
+	std	r10, VCPU_GPR(R10)(r9)
+	std	r11, VCPU_GPR(R11)(r9)
+	ld	r3, HSTATE_SCRATCH0(r13)
+	std	r3, VCPU_GPR(R12)(r9)
+	/* CR is in the high half of r12 */
+	srdi	r4, r12, 32
+	std	r4, VCPU_CR(r9)
+BEGIN_FTR_SECTION
+	ld	r3, HSTATE_CFAR(r13)
+	std	r3, VCPU_CFAR(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+	ld	r4, HSTATE_PPR(r13)
+	std	r4, VCPU_PPR(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, HSTATE_HOST_R1(r13)
+	LOAD_PACA_TOC()
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	/* trap is in the low half of r12, clear CR from the high half */
+	clrldi	r12, r12, 32
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(R13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	stw	r12,VCPU_TRAP(r9)
+
+	/*
+	 * Now that we have saved away SRR0/1 and HSRR0/1,
+	 * interrupts are recoverable in principle, so set MSR_RI.
+	 * This becomes important for relocation-on interrupts from
+	 * the guest, which we can get in radix mode on POWER9.
+	 */
+	li	r0, MSR_RI
+	mtmsrd	r0, 1
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r9, VCPU_TB_RMINTR
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+	ld	r5, VCPU_GPR(R5)(r9)
+	ld	r6, VCPU_GPR(R6)(r9)
+	ld	r7, VCPU_GPR(R7)(r9)
+	ld	r8, VCPU_GPR(R8)(r9)
+#endif
+
+	/* Save HEIR (HV emulation assist reg) in emul_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,KVM_INST_FETCH_FAILED
+	std	r3,VCPU_LAST_INST(r9)
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	std	r3,VCPU_HEIR(r9)
+
+	/* these are volatile across C function calls */
+	mfctr	r3
+	mfxer	r4
+	std	r3, VCPU_CTR(r9)
+	std	r4, VCPU_XER(r9)
+
+	/* Save more register state  */
+	mfdar	r3
+	mfdsisr	r4
+	std	r3, VCPU_DAR(r9)
+	stw	r4, VCPU_DSISR(r9)
+
+	/* If this is a page table miss then see if it's theirs or ours */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	kvmppc_hdsi
+	std	r3, VCPU_FAULT_DAR(r9)
+	stw	r4, VCPU_FAULT_DSISR(r9)
+	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	beq	kvmppc_hisi
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	extsw	r3, r3
+	cmpdi	r3,0
+	mr	r4,r9
+	bge	fast_guest_return
+2:
+	/* See if this is an hcall we can handle in real mode */
+	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
+	beq	hcall_try_real_mode
+
+	/* Hypervisor doorbell - exit only if host IPI flag set */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	bne	3f
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	beq	maybe_reenter_guest
+	b	guest_exit_cont
+3:
+	/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL
+	bne	14f
+	mfspr	r3, SPRN_HFSCR
+	std	r3, VCPU_HFSCR(r9)
+	b	guest_exit_cont
+14:
+	/* External interrupt ? */
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	kvmppc_guest_external
+	/* See if it is a machine check */
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	machine_check_realmode
+	/* Or a hypervisor maintenance interrupt */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	beq	hmi_realmode
+
+guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+#endif
+
+	/*
+	 * Possibly flush the link stack here, before we do a blr in
+	 * kvmhv_switch_to_host.
+	 */
+1:	nop
+	patch_site 1b patch__call_kvm_flush_link_stack
+
+	/* For hash guest, read the guest SLB and save it away */
+	li	r5, 0
+	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
+	mtctr	r0
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	bdnz	1b
+	/* Finally clear out the SLB */
+	li	r0,0
+	slbmte	r0,r0
+	PPC_SLBIA(6)
+	ptesync
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	li	r3, SLBSHADOW_SAVEAREA
+	LDX_BE	r5, r8, r3
+	addi	r3, r3, 8
+	LDX_BE	r6, r8, r3
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+guest_bypass:
+	stw	r12, STACK_SLOT_TRAP(r1)
+
+	/* Save DEC */
+	/* Do this before kvmhv_commence_exit so we know TB is guest TB */
+	ld	r3, HSTATE_KVM_VCORE(r13)
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+16:	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Increment exit count, poke other threads to exit */
+	mr 	r3, r12
+	bl	kvmhv_commence_exit
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Stop others sending VCPU interrupts to this physical CPU */
+	li	r0, -1
+	stw	r0, VCPU_CPU(r9)
+	stw	r0, VCPU_THREAD_CPU(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 if it was clear */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	li	r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,HSTATE_PURR(r13)
+	ld	r4,HSTATE_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+BEGIN_FTR_SECTION
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Save POWER8-specific registers */
+	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_PSPB
+	mfspr	r7, SPRN_FSCR
+	std	r5, VCPU_IAMR(r9)
+	stw	r6, VCPU_PSPB(r9)
+	std	r7, VCPU_FSCR(r9)
+	mfspr	r5, SPRN_IC
+	mfspr	r7, SPRN_TAR
+	std	r5, VCPU_IC(r9)
+	std	r7, VCPU_TAR(r9)
+	mfspr	r8, SPRN_EBBHR
+	std	r8, VCPU_EBBHR(r9)
+	mfspr	r5, SPRN_EBBRR
+	mfspr	r6, SPRN_BESCR
+	mfspr	r7, SPRN_PID
+	mfspr	r8, SPRN_WORT
+	std	r5, VCPU_EBBRR(r9)
+	std	r6, VCPU_BESCR(r9)
+	stw	r7, VCPU_GUEST_PID(r9)
+	std	r8, VCPU_WORT(r9)
+	mfspr	r5, SPRN_TCSCR
+	mfspr	r6, SPRN_ACOP
+	mfspr	r7, SPRN_CSIGR
+	mfspr	r8, SPRN_TACR
+	std	r5, VCPU_TCSCR(r9)
+	std	r6, VCPU_ACOP(r9)
+	std	r7, VCPU_CSIGR(r9)
+	std	r8, VCPU_TACR(r9)
+BEGIN_FTR_SECTION
+	ld	r5, STACK_SLOT_FSCR(r1)
+	mtspr	SPRN_FSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	/*
+	 * Restore various registers to 0, where non-zero values
+	 * set by the guest could disrupt the host.
+	 */
+	li	r0, 0
+	mtspr	SPRN_PSPB, r0
+	mtspr	SPRN_WORT, r0
+	mtspr	SPRN_TCSCR, r0
+	/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
+	li	r0, 1
+	sldi	r0, r0, 31
+	mtspr	SPRN_MMCRS, r0
+
+	/* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
+	ld	r8, STACK_SLOT_IAMR(r1)
+	mtspr	SPRN_IAMR, r8
+
+8:	/* Power7 jumps back in here */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	ld	r5,STACK_SLOT_AMR(r1)
+	ld	r6,STACK_SLOT_UAMOR(r1)
+	mtspr	SPRN_AMR, r5
+	mtspr	SPRN_UAMOR, r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r9)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(R14)(r9)
+	std	r15, VCPU_GPR(R15)(r9)
+	std	r16, VCPU_GPR(R16)(r9)
+	std	r17, VCPU_GPR(R17)(r9)
+	std	r18, VCPU_GPR(R18)(r9)
+	std	r19, VCPU_GPR(R19)(r9)
+	std	r20, VCPU_GPR(R20)(r9)
+	std	r21, VCPU_GPR(R21)(r9)
+	std	r22, VCPU_GPR(R22)(r9)
+	std	r23, VCPU_GPR(R23)(r9)
+	std	r24, VCPU_GPR(R24)(r9)
+	std	r25, VCPU_GPR(R25)(r9)
+	std	r26, VCPU_GPR(R26)(r9)
+	std	r27, VCPU_GPR(R27)(r9)
+	std	r28, VCPU_GPR(R28)(r9)
+	std	r29, VCPU_GPR(R29)(r9)
+	std	r30, VCPU_GPR(R30)(r9)
+	std	r31, VCPU_GPR(R31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* save FP state */
+	mr	r3, r9
+	bl	kvmppc_save_fp
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	91f
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
+	 */
+	mr      r3, r9
+	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
+	bl	kvmppc_save_tm_hv
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+91:
+#endif
+
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	li	r4, LPPACA_YIELDCOUNT
+	LWZX_BE	r3, r8, r4
+	addi	r3, r3, 1
+	STWX_BE	r3, r8, r4
+	li	r3, 1
+	stb	r3, VCPU_VPA_DIRTY(r9)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
+	mr	r3, r9
+	li	r4, 1
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r4, LPPACA_PMCINUSE(r8)
+21:	bl	kvmhv_save_guest_pmu
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Restore host values of some registers */
+BEGIN_FTR_SECTION
+	ld	r5, STACK_SLOT_CIABR(r1)
+	ld	r6, STACK_SLOT_DAWR0(r1)
+	ld	r7, STACK_SLOT_DAWRX0(r1)
+	mtspr	SPRN_CIABR, r5
+	/*
+	 * If the DAWR doesn't work, it's ok to write these here as
+	 * this value should always be zero
+	*/
+	mtspr	SPRN_DAWR0, r6
+	mtspr	SPRN_DAWRX0, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/*
+	 * POWER7/POWER8 guest -> host partition switch code.
+	 * We don't have to lock against tlbies but we do
+	 * have to coordinate the hardware threads.
+	 * Here STACK_SLOT_TRAP(r1) contains the trap number.
+	 */
+kvmhv_switch_to_host:
+	/* Secondary threads wait for primary to do partition switch */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r3,HSTATE_PTID(r13)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	rlwinm	r0,r3,32-8,0xff
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Did we actually switch to the guest at all? */
+	lbz	r6, VCORE_IN_GUEST(r5)
+	cmpwi	r6, 0
+	beq	19f
+
+	/* Primary thread switches back to host partition */
+	lwz	r7,KVM_HOST_LPID(r4)
+	ld	r6,KVM_HOST_SDR1(r4)
+	li	r8,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to host page table */
+	mtspr	SPRN_LPID,r7
+	isync
+
+BEGIN_FTR_SECTION
+	/* DPDES and VTB are shared between threads */
+	mfspr	r7, SPRN_DPDES
+	mfspr	r8, SPRN_VTB
+	std	r7, VCORE_DPDES(r5)
+	std	r8, VCORE_VTB(r5)
+	/* clear DPDES so we don't get guest doorbells in the host */
+	li	r8, 0
+	mtspr	SPRN_DPDES, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/* Subtract timebase offset from timebase */
+	ld	r8, VCORE_TB_OFFSET_APPL(r5)
+	cmpdi	r8,0
+	beq	17f
+	li	r0, 0
+	std	r0, VCORE_TB_OFFSET_APPL(r5)
+	mftb	r6			/* current guest timebase */
+	subf	r8,r8,r6
+	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
+	mftb	r7			/* check if lower 24 bits overflowed */
+	clrldi	r6,r6,40
+	clrldi	r7,r7,40
+	cmpld	r7,r6
+	bge	17f
+	addis	r8,r8,0x100		/* if so, increment upper 40 bits */
+	mtspr	SPRN_TBU40,r8
+
+17:
+	/*
+	 * If this is an HMI, we called kvmppc_realmode_hmi_handler
+	 * above, which may or may not have already called
+	 * kvmppc_subcore_exit_guest.  Fortunately, all that
+	 * kvmppc_subcore_exit_guest does is clear a flag, so calling
+	 * it again here is benign even if kvmppc_realmode_hmi_handler
+	 * has already called it.
+	 */
+	bl	kvmppc_subcore_exit_guest
+	nop
+30:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
+
+	/* Reset PCR */
+	ld	r0, VCORE_PCR(r5)
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r0, r6
+	beq	18f
+	mtspr	SPRN_PCR, r6
+18:
+	/* Signal secondary CPUs to continue */
+	li	r0, 0
+	stb	r0,VCORE_IN_GUEST(r5)
+19:	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+16:	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	/* Finish timing, if we have a vcpu */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	li	r3, 0
+	beq	2f
+	bl	kvmhv_accumulate_time
+2:
+#endif
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	lwz	r12, STACK_SLOT_TRAP(r1)	/* return trap # in r12 */
+	ld	r0, SFS+PPC_LR_STKOFF(r1)
+	addi	r1, r1, SFS
+	mtlr	r0
+	blr
+
+.balign 32
+.global kvm_flush_link_stack
+kvm_flush_link_stack:
+	/* Save LR into r0 */
+	mflr	r0
+
+	/* Flush the link stack. On Power8 it's up to 32 entries in size. */
+	.rept 32
+	ANNOTATE_INTRA_FUNCTION_CALL
+	bl	.+4
+	.endr
+
+	/* And on Power9 it's up to 64. */
+BEGIN_FTR_SECTION
+	.rept 32
+	ANNOTATE_INTRA_FUNCTION_CALL
+	bl	.+4
+	.endr
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
+	/* Restore LR */
+	mtlr	r0
+	blr
+
+kvmppc_guest_external:
+	/* External interrupt, first check for host_ipi. If this is
+	 * set, we know the host wants us out so let's do it now
+	 */
+	bl	CFUNC(kvmppc_read_intr)
+
+	/*
+	 * Restore the active volatile registers after returning from
+	 * a C function.
+	 */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+
+	/*
+	 * kvmppc_read_intr return codes:
+	 *
+	 * Exit to host (r3 > 0)
+	 *   1 An interrupt is pending that needs to be handled by the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *
+	 *   2 Passthrough that needs completion in the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+	 *     to indicate to the host to complete handling the interrupt
+	 *
+	 * Before returning to guest, we check if any CPU is heading out
+	 * to the host and if so, we head out also. If no CPUs are heading
+	 * check return values <= 0.
+	 *
+	 * Return to guest (r3 <= 0)
+	 *  0 No external interrupt is pending
+	 * -1 A guest wakeup IPI (which has now been cleared)
+	 *    In either case, we return to guest to deliver any pending
+	 *    guest interrupts.
+	 *
+	 * -2 A PCI passthrough external interrupt was handled
+	 *    (interrupt was delivered directly to guest)
+	 *    Return to guest to deliver any pending guest interrupts.
+	 */
+
+	cmpdi	r3, 1
+	ble	1f
+
+	/* Return code = 2 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+	stw	r12, VCPU_TRAP(r9)
+	b	guest_exit_cont
+
+1:	/* Return code <= 1 */
+	cmpdi	r3, 0
+	bgt	guest_exit_cont
+
+	/* Return code <= 0 */
+maybe_reenter_guest:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	mr	r4, r9
+	blt	deliver_guest_interrupt
+	b	guest_exit_cont
+
+/*
+ * Check whether an HDSI is an HPTE not found fault or something else.
+ * If it is an HPTE not found fault that is due to the guest accessing
+ * a page that they have mapped but which we have paged out, then
+ * we continue on with the guest exit path.  In all other cases,
+ * reflect the HDSI to the guest as a DSI.
+ */
+kvmppc_hdsi:
+	mfspr	r4, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	/* HPTE not found fault or protection fault? */
+	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
+	beq	1f			/* if not, send it to the guest */
+	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
+	beq	3f
+	clrrdi	r0, r4, 28
+	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
+	li	r0, BOOK3S_INTERRUPT_DATA_SEGMENT
+	bne	7f			/* if no SLB entry found */
+4:	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	li	r7, 1			/* data fault */
+	bl	CFUNC(kvmppc_hpte_hv_fault)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	6f
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	guest_exit_cont
+	cmpdi	r3, -2			/* MMIO emulation; need instr word */
+	beq	2f
+
+	/* Synthesize a DSI (or DSegI) for the guest */
+	ld	r4, VCPU_FAULT_DAR(r9)
+	mr	r6, r3
+1:	li	r0, BOOK3S_INTERRUPT_DATA_STORAGE
+	mtspr	SPRN_DSISR, r6
+7:	mtspr	SPRN_DAR, r4
+	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	mr	r10, r0
+	bl	kvmppc_msr_interrupt
+fast_interrupt_c_return:
+6:	ld	r7, VCPU_CTR(r9)
+	ld	r8, VCPU_XER(r9)
+	mtctr	r7
+	mtxer	r8
+	mr	r4, r9
+	b	fast_guest_return
+
+3:	ld	r5, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r5)
+	b	4b
+
+	/* If this is for emulated MMIO, load the instruction word */
+2:	li	r8, KVM_INST_FETCH_FAILED	/* In case lwz faults */
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r0, KVM_GUEST_MODE_SKIP
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	/* Do the access with MSR:DR enabled */
+	mfmsr	r3
+	ori	r4, r3, MSR_DR		/* Enable paging for data */
+	mtmsrd	r4
+	lwz	r8, 0(r10)
+	mtmsrd	r3
+
+	/* Store the result */
+	std	r8, VCPU_LAST_INST(r9)
+
+	/* Unset guest mode. */
+	li	r0, KVM_GUEST_MODE_HOST_HV
+	stb	r0, HSTATE_IN_GUEST(r13)
+	b	guest_exit_cont
+
+/*
+ * Similarly for an HISI, reflect it to the guest as an ISI unless
+ * it is an HPTE not found fault for a page that we have paged out.
+ */
+kvmppc_hisi:
+	andis.	r0, r11, SRR1_ISI_NOPT@h
+	beq	1f
+	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
+	beq	3f
+	clrrdi	r0, r10, 28
+	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
+	li	r0, BOOK3S_INTERRUPT_INST_SEGMENT
+	bne	7f			/* if no SLB entry found */
+4:
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	mr	r4, r10
+	mr	r6, r11
+	li	r7, 0			/* instruction fault */
+	bl	CFUNC(kvmppc_hpte_hv_fault)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	fast_interrupt_c_return
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	guest_exit_cont
+
+	/* Synthesize an ISI (or ISegI) for the guest */
+	mr	r11, r3
+1:	li	r0, BOOK3S_INTERRUPT_INST_STORAGE
+7:	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	mr	r10, r0
+	bl	kvmppc_msr_interrupt
+	b	fast_interrupt_c_return
+
+3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r6)
+	b	4b
+
+/*
+ * Try to handle an hcall in real mode.
+ * Returns to the guest if we handle it, or continues on up to
+ * the kernel if we can't (i.e. if we don't have a handler for
+ * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
+ */
+hcall_try_real_mode:
+	ld	r3,VCPU_GPR(R3)(r9)
+	andi.	r0,r11,MSR_PR
+	/* sc 1 from userspace - reflect to guest syscall */
+	bne	sc_1_fast_return
+	clrrdi	r3,r3,2
+	cmpldi	r3,hcall_real_table_end - hcall_real_table
+	bge	guest_exit_cont
+	/* See if this hcall is enabled for in-kernel handling */
+	ld	r4, VCPU_KVM(r9)
+	srdi	r0, r3, 8	/* r0 = (r3 / 4) >> 6 */
+	sldi	r0, r0, 3	/* index into kvm->arch.enabled_hcalls[] */
+	add	r4, r4, r0
+	ld	r0, KVM_ENABLED_HCALLS(r4)
+	rlwinm	r4, r3, 32-2, 0x3f	/* r4 = (r3 / 4) & 0x3f */
+	srd	r0, r0, r4
+	andi.	r0, r0, 1
+	beq	guest_exit_cont
+	/* Get pointer to handler, if any, and call it */
+	LOAD_REG_ADDR(r4, hcall_real_table)
+	lwax	r3,r3,r4
+	cmpwi	r3,0
+	beq	guest_exit_cont
+	add	r12,r3,r4
+	mtctr	r12
+	mr	r3,r9		/* get vcpu pointer */
+	ld	r4,VCPU_GPR(R4)(r9)
+	bctrl
+	cmpdi	r3,H_TOO_HARD
+	beq	hcall_real_fallback
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	std	r3,VCPU_GPR(R3)(r4)
+	ld	r10,VCPU_PC(r4)
+	ld	r11,VCPU_MSR(r4)
+	b	fast_guest_return
+
+sc_1_fast_return:
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10, BOOK3S_INTERRUPT_SYSCALL
+	bl	kvmppc_msr_interrupt
+	mr	r4,r9
+	b	fast_guest_return
+
+	/* We've attempted a real mode hcall, but it's punted it back
+	 * to userspace.  We need to restore some clobbered volatiles
+	 * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+	li	r12,BOOK3S_INTERRUPT_SYSCALL
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	b	guest_exit_cont
+
+	.globl	hcall_real_table
+hcall_real_table:
+	.long	0		/* 0 - unused */
+	.long	DOTSYM(kvmppc_h_remove) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_enter) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_read) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_protect) - hcall_real_table
+	.long	0		/* 0x1c */
+	.long	0		/* 0x20 */
+	.long	0		/* 0x24 - H_SET_SPRG0 */
+	.long	DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table
+	.long	0		/* 0x30 */
+	.long	0		/* 0x34 */
+	.long	0		/* 0x38 */
+	.long	0		/* 0x3c */
+	.long	0		/* 0x40 */
+	.long	0		/* 0x44 */
+	.long	0		/* 0x48 */
+	.long	0		/* 0x4c */
+	.long	0		/* 0x50 */
+	.long	0		/* 0x54 */
+	.long	0		/* 0x58 */
+	.long	0		/* 0x5c */
+	.long	0		/* 0x60 */
+#ifdef CONFIG_KVM_XICS
+	.long	DOTSYM(xics_rm_h_eoi) - hcall_real_table
+	.long	DOTSYM(xics_rm_h_cppr) - hcall_real_table
+	.long	DOTSYM(xics_rm_h_ipi) - hcall_real_table
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	DOTSYM(xics_rm_h_xirr) - hcall_real_table
+#else
+	.long	0		/* 0x64 - H_EOI */
+	.long	0		/* 0x68 - H_CPPR */
+	.long	0		/* 0x6c - H_IPI */
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	0		/* 0x74 - H_XIRR */
+#endif
+	.long	0		/* 0x78 */
+	.long	0		/* 0x7c */
+	.long	0		/* 0x80 */
+	.long	0		/* 0x84 */
+	.long	0		/* 0x88 */
+	.long	0		/* 0x8c */
+	.long	0		/* 0x90 */
+	.long	0		/* 0x94 */
+	.long	0		/* 0x98 */
+	.long	0		/* 0x9c */
+	.long	0		/* 0xa0 */
+	.long	0		/* 0xa4 */
+	.long	0		/* 0xa8 */
+	.long	0		/* 0xac */
+	.long	0		/* 0xb0 */
+	.long	0		/* 0xb4 */
+	.long	0		/* 0xb8 */
+	.long	0		/* 0xbc */
+	.long	0		/* 0xc0 */
+	.long	0		/* 0xc4 */
+	.long	0		/* 0xc8 */
+	.long	0		/* 0xcc */
+	.long	0		/* 0xd0 */
+	.long	0		/* 0xd4 */
+	.long	0		/* 0xd8 */
+	.long	0		/* 0xdc */
+	.long	DOTSYM(kvmppc_h_cede) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_confer) - hcall_real_table
+	.long	0		/* 0xe8 */
+	.long	0		/* 0xec */
+	.long	0		/* 0xf0 */
+	.long	0		/* 0xf4 */
+	.long	0		/* 0xf8 */
+	.long	0		/* 0xfc */
+	.long	0		/* 0x100 */
+	.long	0		/* 0x104 */
+	.long	0		/* 0x108 */
+	.long	0		/* 0x10c */
+	.long	0		/* 0x110 */
+	.long	0		/* 0x114 */
+	.long	0		/* 0x118 */
+	.long	0		/* 0x11c */
+	.long	0		/* 0x120 */
+	.long	DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table
+	.long	0		/* 0x128 */
+	.long	0		/* 0x12c */
+	.long	0		/* 0x130 */
+	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+	.long	0		/* 0x138 */
+	.long	0		/* 0x13c */
+	.long	0		/* 0x140 */
+	.long	0		/* 0x144 */
+	.long	0		/* 0x148 */
+	.long	0		/* 0x14c */
+	.long	0		/* 0x150 */
+	.long	0		/* 0x154 */
+	.long	0		/* 0x158 */
+	.long	0		/* 0x15c */
+	.long	0		/* 0x160 */
+	.long	0		/* 0x164 */
+	.long	0		/* 0x168 */
+	.long	0		/* 0x16c */
+	.long	0		/* 0x170 */
+	.long	0		/* 0x174 */
+	.long	0		/* 0x178 */
+	.long	0		/* 0x17c */
+	.long	0		/* 0x180 */
+	.long	0		/* 0x184 */
+	.long	0		/* 0x188 */
+	.long	0		/* 0x18c */
+	.long	0		/* 0x190 */
+	.long	0		/* 0x194 */
+	.long	0		/* 0x198 */
+	.long	0		/* 0x19c */
+	.long	0		/* 0x1a0 */
+	.long	0		/* 0x1a4 */
+	.long	0		/* 0x1a8 */
+	.long	0		/* 0x1ac */
+	.long	0		/* 0x1b0 */
+	.long	0		/* 0x1b4 */
+	.long	0		/* 0x1b8 */
+	.long	0		/* 0x1bc */
+	.long	0		/* 0x1c0 */
+	.long	0		/* 0x1c4 */
+	.long	0		/* 0x1c8 */
+	.long	0		/* 0x1cc */
+	.long	0		/* 0x1d0 */
+	.long	0		/* 0x1d4 */
+	.long	0		/* 0x1d8 */
+	.long	0		/* 0x1dc */
+	.long	0		/* 0x1e0 */
+	.long	0		/* 0x1e4 */
+	.long	0		/* 0x1e8 */
+	.long	0		/* 0x1ec */
+	.long	0		/* 0x1f0 */
+	.long	0		/* 0x1f4 */
+	.long	0		/* 0x1f8 */
+	.long	0		/* 0x1fc */
+	.long	0		/* 0x200 */
+	.long	0		/* 0x204 */
+	.long	0		/* 0x208 */
+	.long	0		/* 0x20c */
+	.long	0		/* 0x210 */
+	.long	0		/* 0x214 */
+	.long	0		/* 0x218 */
+	.long	0		/* 0x21c */
+	.long	0		/* 0x220 */
+	.long	0		/* 0x224 */
+	.long	0		/* 0x228 */
+	.long	0		/* 0x22c */
+	.long	0		/* 0x230 */
+	.long	0		/* 0x234 */
+	.long	0		/* 0x238 */
+	.long	0		/* 0x23c */
+	.long	0		/* 0x240 */
+	.long	0		/* 0x244 */
+	.long	0		/* 0x248 */
+	.long	0		/* 0x24c */
+	.long	0		/* 0x250 */
+	.long	0		/* 0x254 */
+	.long	0		/* 0x258 */
+	.long	0		/* 0x25c */
+	.long	0		/* 0x260 */
+	.long	0		/* 0x264 */
+	.long	0		/* 0x268 */
+	.long	0		/* 0x26c */
+	.long	0		/* 0x270 */
+	.long	0		/* 0x274 */
+	.long	0		/* 0x278 */
+	.long	0		/* 0x27c */
+	.long	0		/* 0x280 */
+	.long	0		/* 0x284 */
+	.long	0		/* 0x288 */
+	.long	0		/* 0x28c */
+	.long	0		/* 0x290 */
+	.long	0		/* 0x294 */
+	.long	0		/* 0x298 */
+	.long	0		/* 0x29c */
+	.long	0		/* 0x2a0 */
+	.long	0		/* 0x2a4 */
+	.long	0		/* 0x2a8 */
+	.long	0		/* 0x2ac */
+	.long	0		/* 0x2b0 */
+	.long	0		/* 0x2b4 */
+	.long	0		/* 0x2b8 */
+	.long	0		/* 0x2bc */
+	.long	0		/* 0x2c0 */
+	.long	0		/* 0x2c4 */
+	.long	0		/* 0x2c8 */
+	.long	0		/* 0x2cc */
+	.long	0		/* 0x2d0 */
+	.long	0		/* 0x2d4 */
+	.long	0		/* 0x2d8 */
+	.long	0		/* 0x2dc */
+	.long	0		/* 0x2e0 */
+	.long	0		/* 0x2e4 */
+	.long	0		/* 0x2e8 */
+	.long	0		/* 0x2ec */
+	.long	0		/* 0x2f0 */
+	.long	0		/* 0x2f4 */
+	.long	0		/* 0x2f8 */
+#ifdef CONFIG_KVM_XICS
+	.long	DOTSYM(xics_rm_h_xirr_x) - hcall_real_table
+#else
+	.long	0		/* 0x2fc - H_XIRR_X*/
+#endif
+	.long	DOTSYM(kvmppc_rm_h_random) - hcall_real_table
+	.globl	hcall_real_table_end
+hcall_real_table_end:
+
+_GLOBAL_TOC(kvmppc_h_set_xdabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
+	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
+	beq	6f
+	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
+	andc.	r0, r5, r0
+	beq	3f
+6:	li	r3, H_PARAMETER
+	blr
+
+_GLOBAL_TOC(kvmppc_h_set_dabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
+	li	r5, DABRX_USER | DABRX_KERNEL
+3:
+BEGIN_FTR_SECTION
+	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	std	r4,VCPU_DABR(r3)
+	stw	r5, VCPU_DABRX(r3)
+	mtspr	SPRN_DABRX, r5
+	/* Work around P7 bug where DABR can get corrupted on mtspr */
+1:	mtspr	SPRN_DABR,r4
+	mfspr	r5, SPRN_DABR
+	cmpd	r4, r5
+	bne	1b
+	isync
+	li	r3,0
+	blr
+
+2:
+	LOAD_REG_ADDR(r11, dawr_force_enable)
+	lbz	r11, 0(r11)
+	cmpdi	r11, 0
+	bne	3f
+	li	r3, H_HARDWARE
+	blr
+3:
+	/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
+	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW
+	rlwimi	r5, r4, 2, DAWRX_WT
+	clrrdi	r4, r4, 3
+	std	r4, VCPU_DAWR0(r3)
+	std	r5, VCPU_DAWRX0(r3)
+	/*
+	 * If came in through the real mode hcall handler then it is necessary
+	 * to write the registers since the return path won't. Otherwise it is
+	 * sufficient to store then in the vcpu struct as they will be loaded
+	 * next time the vcpu is run.
+	 */
+	mfmsr	r6
+	andi.	r6, r6, MSR_DR		/* in real mode? */
+	bne	4f
+	mtspr	SPRN_DAWR0, r4
+	mtspr	SPRN_DAWRX0, r5
+4:	li	r3, 0
+	blr
+
+_GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
+	ori	r11,r11,MSR_EE
+	std	r11,VCPU_MSR(r3)
+	li	r0,1
+	stb	r0,VCPU_CEDED(r3)
+	sync			/* order setting ceded vs. testing prodded */
+	lbz	r5,VCPU_PRODDED(r3)
+	cmpwi	r5,0
+	bne	kvm_cede_prodded
+	li	r12,0		/* set trap to 0 to say hcall is handled */
+	stw	r12,VCPU_TRAP(r3)
+	li	r0,H_SUCCESS
+	std	r0,VCPU_GPR(R3)(r3)
+
+	/*
+	 * Set our bit in the bitmask of napping threads unless all the
+	 * other threads are already napping, in which case we send this
+	 * up to the host.
+	 */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lbz	r6,HSTATE_PTID(r13)
+	lwz	r8,VCORE_ENTRY_EXIT(r5)
+	clrldi	r8,r8,56
+	li	r0,1
+	sld	r0,r0,r6
+	addi	r6,r5,VCORE_NAPPING_THREADS
+31:	lwarx	r4,0,r6
+	or	r4,r4,r0
+	cmpw	r4,r8
+	beq	kvm_cede_exit
+	stwcx.	r4,0,r6
+	bne	31b
+	/* order napping_threads update vs testing entry_exit_map */
+	isync
+	li	r0,NAPPING_CEDE
+	stb	r0,HSTATE_NAPPING(r13)
+	lwz	r7,VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7,0x100
+	bge	33f		/* another thread already exiting */
+
+/*
+ * Although not specifically required by the architecture, POWER7
+ * preserves the following registers in nap mode, even if an SMT mode
+ * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
+ * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
+ */
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(R14)(r3)
+	std	r15, VCPU_GPR(R15)(r3)
+	std	r16, VCPU_GPR(R16)(r3)
+	std	r17, VCPU_GPR(R17)(r3)
+	std	r18, VCPU_GPR(R18)(r3)
+	std	r19, VCPU_GPR(R19)(r3)
+	std	r20, VCPU_GPR(R20)(r3)
+	std	r21, VCPU_GPR(R21)(r3)
+	std	r22, VCPU_GPR(R22)(r3)
+	std	r23, VCPU_GPR(R23)(r3)
+	std	r24, VCPU_GPR(R24)(r3)
+	std	r25, VCPU_GPR(R25)(r3)
+	std	r26, VCPU_GPR(R26)(r3)
+	std	r27, VCPU_GPR(R27)(r3)
+	std	r28, VCPU_GPR(R28)(r3)
+	std	r29, VCPU_GPR(R29)(r3)
+	std	r30, VCPU_GPR(R30)(r3)
+	std	r31, VCPU_GPR(R31)(r3)
+
+	/* save FP state */
+	bl	kvmppc_save_fp
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	91f
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
+	 */
+	ld	r3, HSTATE_KVM_VCPU(r13)
+	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
+	bl	kvmppc_save_tm_hv
+	nop
+91:
+#endif
+
+	/*
+	 * Set DEC to the smaller of DEC and HDEC, so that we wake
+	 * no later than the end of our timeslice (HDEC interrupts
+	 * don't wake us from nap).
+	 */
+	mfspr	r3, SPRN_DEC
+	mfspr	r4, SPRN_HDEC
+	mftb	r5
+	extsw	r3, r3
+	extsw	r4, r4
+	cmpd	r3, r4
+	ble	67f
+	mtspr	SPRN_DEC, r4
+67:
+	/* save expiry time of guest decrementer */
+	add	r3, r3, r5
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	std	r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	addi	r3, r4, VCPU_TB_CEDE
+	bl	kvmhv_accumulate_time
+#endif
+
+	lis	r3, LPCR_PECEDP@h	/* Do wake on privileged doorbell */
+
+	/* Go back to host stack */
+	ld	r1, HSTATE_HOST_R1(r13)
+
+	/*
+	 * Take a nap until a decrementer or external or doobell interrupt
+	 * occurs, with PECE1 and PECE0 set in LPCR.
+	 * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+	 * Also clear the runlatch bit before napping.
+	 */
+kvm_do_nap:
+	li	r0,0
+	mtspr	SPRN_CTRLT, r0
+
+	li	r0,1
+	stb	r0,HSTATE_HWTHREAD_REQ(r13)
+	mfspr	r5,SPRN_LPCR
+	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
+BEGIN_FTR_SECTION
+	ori	r5, r5, LPCR_PECEDH
+	rlwimi	r5, r3, 0, LPCR_PECEDP
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+kvm_nap_sequence:		/* desired LPCR value in r5 */
+	li	r3, PNV_THREAD_NAP
+	mtspr	SPRN_LPCR,r5
+	isync
+
+	bl	isa206_idle_insn_mayloss
+
+	li	r0,1
+	mtspr	SPRN_CTRLT, r0
+
+	mtspr	SPRN_SRR1, r3
+
+	li	r0, 0
+	stb	r0, PACA_FTRACE_ENABLED(r13)
+
+	li	r0, KVM_HWTHREAD_IN_KVM
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+
+	lbz	r0, HSTATE_NAPPING(r13)
+	cmpwi	r0, NAPPING_CEDE
+	beq	kvm_end_cede
+	cmpwi	r0, NAPPING_NOVCPU
+	beq	kvm_novcpu_wakeup
+	cmpwi	r0, NAPPING_UNSPLIT
+	beq	kvm_unsplit_wakeup
+	twi	31,0,0 /* Nap state must not be zero */
+
+33:	mr	r4, r3
+	li	r3, 0
+	li	r12, 0
+	b	34f
+
+kvm_end_cede:
+	/* Woken by external or decrementer interrupt */
+
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+	addi	r3, r4, VCPU_TB_RMINTR
+	bl	kvmhv_accumulate_time
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	91f
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+	/*
+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
+	 */
+	mr      r3, r4
+	ld      r4, VCPU_MSR(r3)
+	li	r5, 0			/* don't preserve non-vol regs */
+	bl	kvmppc_restore_tm_hv
+	nop
+	ld	r4, HSTATE_KVM_VCPU(r13)
+91:
+#endif
+
+	/* load up FP state */
+	bl	kvmppc_load_fp
+
+	/* Restore guest decrementer */
+	ld	r3, VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3, r7, r3
+	mtspr	SPRN_DEC, r3
+
+	/* Load NV GPRS */
+	ld	r14, VCPU_GPR(R14)(r4)
+	ld	r15, VCPU_GPR(R15)(r4)
+	ld	r16, VCPU_GPR(R16)(r4)
+	ld	r17, VCPU_GPR(R17)(r4)
+	ld	r18, VCPU_GPR(R18)(r4)
+	ld	r19, VCPU_GPR(R19)(r4)
+	ld	r20, VCPU_GPR(R20)(r4)
+	ld	r21, VCPU_GPR(R21)(r4)
+	ld	r22, VCPU_GPR(R22)(r4)
+	ld	r23, VCPU_GPR(R23)(r4)
+	ld	r24, VCPU_GPR(R24)(r4)
+	ld	r25, VCPU_GPR(R25)(r4)
+	ld	r26, VCPU_GPR(R26)(r4)
+	ld	r27, VCPU_GPR(R27)(r4)
+	ld	r28, VCPU_GPR(R28)(r4)
+	ld	r29, VCPU_GPR(R29)(r4)
+	ld	r30, VCPU_GPR(R30)(r4)
+	ld	r31, VCPU_GPR(R31)(r4)
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	bl	kvmppc_check_wake_reason
+
+	/*
+	 * Restore volatile registers since we could have called a
+	 * C routine in kvmppc_check_wake_reason
+	 *	r4 = VCPU
+	 * r3 tells us whether we need to return to host or not
+	 * WARNING: it gets checked further down:
+	 * should not modify r3 until this check is done.
+	 */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+	/* clear our bit in vcore->napping_threads */
+34:	ld	r5,HSTATE_KVM_VCORE(r13)
+	lbz	r7,HSTATE_PTID(r13)
+	li	r0,1
+	sld	r0,r0,r7
+	addi	r6,r5,VCORE_NAPPING_THREADS
+32:	lwarx	r7,0,r6
+	andc	r7,r7,r0
+	stwcx.	r7,0,r6
+	bne	32b
+	li	r0,0
+	stb	r0,HSTATE_NAPPING(r13)
+
+	/* See if the wake reason saved in r3 means we need to exit */
+	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
+	cmpdi	r3, 0
+	bgt	guest_exit_cont
+	b	maybe_reenter_guest
+
+	/* cede when already previously prodded case */
+kvm_cede_prodded:
+	li	r0,0
+	stb	r0,VCPU_PRODDED(r3)
+	sync			/* order testing prodded vs. clearing ceded */
+	stb	r0,VCPU_CEDED(r3)
+	li	r3,H_SUCCESS
+	blr
+
+	/* we've ceded but we want to give control to the host */
+kvm_cede_exit:
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	b	guest_exit_cont
+
+	/* Try to do machine check recovery in real mode */
+machine_check_realmode:
+	mr	r3, r9		/* get vcpu pointer */
+	bl	kvmppc_realmode_machine_check
+	nop
+	/* all machine checks go to virtual mode for further handling */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	b	guest_exit_cont
+
+/*
+ * Call C code to handle a HMI in real mode.
+ * Only the primary thread does the call, secondary threads are handled
+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
+ * r9 points to the vcpu on entry
+ */
+hmi_realmode:
+	lbz	r0, HSTATE_PTID(r13)
+	cmpwi	r0, 0
+	bne	guest_exit_cont
+	bl	CFUNC(kvmppc_realmode_hmi_handler)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_HMI
+	b	guest_exit_cont
+
+/*
+ * Check the reason we woke from nap, and take appropriate action.
+ * Returns (in r3):
+ *	0 if nothing needs to be done
+ *	1 if something happened that needs to be handled by the host
+ *	-1 if there was a guest wakeup (IPI or msgsnd)
+ *	-2 if we handled a PCI passthrough interrupt (returned by
+ *		kvmppc_read_intr only)
+ *
+ * Also sets r12 to the interrupt vector for any interrupt that needs
+ * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
+ */
+SYM_FUNC_START_LOCAL(kvmppc_check_wake_reason)
+	mfspr	r6, SPRN_SRR1
+BEGIN_FTR_SECTION
+	rlwinm	r6, r6, 45-31, 0xf	/* extract wake reason field (P8) */
+FTR_SECTION_ELSE
+	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
+	cmpwi	r6, 8			/* was it an external interrupt? */
+	beq	7f			/* if so, see what it was */
+	li	r3, 0
+	li	r12, 0
+	cmpwi	r6, 6			/* was it the decrementer? */
+	beq	0f
+BEGIN_FTR_SECTION
+	cmpwi	r6, 5			/* privileged doorbell? */
+	beq	0f
+	cmpwi	r6, 3			/* hypervisor doorbell? */
+	beq	3f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	cmpwi	r6, 0xa			/* Hypervisor maintenance ? */
+	beq	4f
+	li	r3, 1			/* anything else, return 1 */
+0:	blr
+
+	/* hypervisor doorbell */
+3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+
+	/*
+	 * Clear the doorbell as we will invoke the handler
+	 * explicitly in the guest exit path.
+	 */
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+	/* see if it's a host IPI */
+	li	r3, 1
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bnelr
+	/* if not, return -1 */
+	li	r3, -1
+	blr
+
+	/* Woken up due to Hypervisor maintenance interrupt */
+4:	li	r12, BOOK3S_INTERRUPT_HMI
+	li	r3, 1
+	blr
+
+	/* external interrupt - create a stack frame so we can call C */
+7:	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -PPC_MIN_STKFRM(r1)
+	bl	CFUNC(kvmppc_read_intr)
+	nop
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	cmpdi	r3, 1
+	ble	1f
+
+	/*
+	 * Return code of 2 means PCI passthrough interrupt, but
+	 * we need to return back to host to complete handling the
+	 * interrupt. Trap reason is expected in r12 by guest
+	 * exit code.
+	 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+	ld	r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+	addi	r1, r1, PPC_MIN_STKFRM
+	mtlr	r0
+	blr
+SYM_FUNC_END(kvmppc_check_wake_reason)
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+ * N.B. r30 and r31 are volatile across this function,
+ * thus it is not callable from C.
+ */
+SYM_FUNC_START_LOCAL(kvmppc_save_fp)
+	mflr	r30
+	mr	r31,r3
+	mfmsr	r5
+	ori	r8,r5,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	addi	r3,r3,VCPU_FPRS
+	bl	store_fp_state
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	addi	r3,r31,VCPU_VRS
+	bl	store_vr_state
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r31)
+	mtlr	r30
+	blr
+SYM_FUNC_END(kvmppc_save_fp)
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ * N.B. r30 and r31 are volatile across this function,
+ * thus it is not callable from C.
+ */
+SYM_FUNC_START_LOCAL(kvmppc_load_fp)
+	mflr	r30
+	mr	r31,r4
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	addi	r3,r4,VCPU_FPRS
+	bl	load_fp_state
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	addi	r3,r31,VCPU_VRS
+	bl	load_vr_state
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r31)
+	mtspr	SPRN_VRSAVE,r7
+	mtlr	r30
+	mr	r4,r31
+	blr
+SYM_FUNC_END(kvmppc_load_fp)
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r3 pointing to the vcpu struct and r4 containing
+ * the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1 and r2 before exit.
+ */
+_GLOBAL_TOC(kvmppc_save_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
+	/* See if we need to handle fake suspend mode */
+BEGIN_FTR_SECTION
+	b	__kvmppc_save_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST)
+
+	lbz	r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
+	cmpwi	r0, 0
+	beq	__kvmppc_save_tm
+
+	/* The following code handles the fake_suspend = 1 case */
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -TM_FRAME_SIZE(r1)
+
+	/* Turn on TM. */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
+	beq	4f
+BEGIN_FTR_SECTION
+	bl	pnv_power9_force_smt4_catch
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
+	nop
+
+	/*
+	 * It's possible that treclaim. may modify registers, if we have lost
+	 * track of fake-suspend state in the guest due to it using rfscv.
+	 * Save and restore registers in case this occurs.
+	 */
+	mfspr	r3, SPRN_DSCR
+	mfspr	r4, SPRN_XER
+	mfspr	r5, SPRN_AMR
+	/* SPRN_TAR would need to be saved here if the kernel ever used it */
+	mfcr	r12
+	SAVE_NVGPRS(r1)
+	SAVE_GPR(2, r1)
+	SAVE_GPR(3, r1)
+	SAVE_GPR(4, r1)
+	SAVE_GPR(5, r1)
+	stw	r12, 8(r1)
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* We have to treclaim here because that's the only way to do S->N */
+	li	r3, TM_CAUSE_KVM_RESCHED
+	TRECLAIM(R3)
+
+	GET_PACA(r13)
+	ld	r1, HSTATE_HOST_R1(r13)
+	REST_GPR(2, r1)
+	REST_GPR(3, r1)
+	REST_GPR(4, r1)
+	REST_GPR(5, r1)
+	lwz	r12, 8(r1)
+	REST_NVGPRS(r1)
+	mtspr	SPRN_DSCR, r3
+	mtspr	SPRN_XER, r4
+	mtspr	SPRN_AMR, r5
+	mtcr	r12
+	HMT_MEDIUM
+
+	/*
+	 * We were in fake suspend, so we are not going to save the
+	 * register state as the guest checkpointed state (since
+	 * we already have it), therefore we can now use any volatile GPR.
+	 * In fact treclaim in fake suspend state doesn't modify
+	 * any registers.
+	 */
+
+BEGIN_FTR_SECTION
+	bl	pnv_power9_force_smt4_release
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
+	nop
+
+4:
+	mfspr	r3, SPRN_PSSCR
+	/* PSSCR_FAKE_SUSPEND is a write-only bit, but clear it anyway */
+	li	r0, PSSCR_FAKE_SUSPEND
+	andc	r3, r3, r0
+	mtspr	SPRN_PSSCR, r3
+
+	/* Don't save TEXASR, use value from last exit in real suspend state */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+
+	addi	r1, r1, TM_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r3 pointing to the vcpu struct
+ * and r4 containing the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1 and r2 from the PACA.
+ */
+_GLOBAL_TOC(kvmppc_restore_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
+	/*
+	 * If we are doing TM emulation for the guest on a POWER9 DD2,
+	 * then we don't actually do a trechkpt -- we either set up
+	 * fake-suspend mode, or emulate a TM rollback.
+	 */
+BEGIN_FTR_SECTION
+	b	__kvmppc_restore_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+
+	li	r0, 0
+	stb	r0, HSTATE_FAKE_SUSPEND(r13)
+
+	/* Turn on TM so we can restore TM SPRs */
+	mfmsr	r5
+	li	r0, 1
+	rldimi	r5, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r5
+
+	/*
+	 * The user may change these outside of a transaction, so they must
+	 * always be context switched.
+	 */
+	ld	r5, VCPU_TFHAR(r3)
+	ld	r6, VCPU_TFIAR(r3)
+	ld	r7, VCPU_TEXASR(r3)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+
+	rldicl. r5, r4, 64 - MSR_TS_S_LG, 62
+	beqlr		/* TM not active in guest */
+
+	/* Make sure the failure summary is set */
+	oris	r7, r7, (TEXASR_FS)@h
+	mtspr	SPRN_TEXASR, r7
+
+	cmpwi	r5, 1		/* check for suspended state */
+	bgt	10f
+	stb	r5, HSTATE_FAKE_SUSPEND(r13)
+	b	9f		/* and return */
+10:	stdu	r1, -PPC_MIN_STKFRM(r1)
+	/* guest is in transactional state, so simulate rollback */
+	bl	kvmhv_emulate_tm_rollback
+	nop
+	addi	r1, r1, PPC_MIN_STKFRM
+9:	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * We come here if we get any exception or interrupt while we are
+ * executing host real mode code while in guest MMU context.
+ * r12 is (CR << 32) | vector
+ * r13 points to our PACA
+ * r12 is saved in HSTATE_SCRATCH0(r13)
+ * r9 is saved in HSTATE_SCRATCH2(r13)
+ * r13 is saved in HSPRG1
+ * cfar is saved in HSTATE_CFAR(r13)
+ * ppr is saved in HSTATE_PPR(r13)
+ */
+kvmppc_bad_host_intr:
+	/*
+	 * Switch to the emergency stack, but start half-way down in
+	 * case we were already on it.
+	 */
+	mr	r9, r1
+	std	r1, PACAR1(r13)
+	ld	r1, PACAEMERGSP(r13)
+	subi	r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
+	std	r9, 0(r1)
+	std	r0, GPR0(r1)
+	std	r9, GPR1(r1)
+	std	r2, GPR2(r1)
+	SAVE_GPRS(3, 8, r1)
+	srdi	r0, r12, 32
+	clrldi	r12, r12, 32
+	std	r0, _CCR(r1)
+	std	r12, _TRAP(r1)
+	andi.	r0, r12, 2
+	beq	1f
+	mfspr	r3, SPRN_HSRR0
+	mfspr	r4, SPRN_HSRR1
+	mfspr	r5, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	b	2f
+1:	mfspr	r3, SPRN_SRR0
+	mfspr	r4, SPRN_SRR1
+	mfspr	r5, SPRN_DAR
+	mfspr	r6, SPRN_DSISR
+2:	std	r3, _NIP(r1)
+	std	r4, _MSR(r1)
+	std	r5, _DAR(r1)
+	std	r6, _DSISR(r1)
+	ld	r9, HSTATE_SCRATCH2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r0)
+	SAVE_GPRS(9, 12, r1)
+	std	r0, GPR13(r1)
+	SAVE_NVGPRS(r1)
+	ld	r5, HSTATE_CFAR(r13)
+	std	r5, ORIG_GPR3(r1)
+	mflr	r3
+	mfctr	r4
+	mfxer	r5
+	lbz	r6, PACAIRQSOFTMASK(r13)
+	std	r3, _LINK(r1)
+	std	r4, _CTR(r1)
+	std	r5, _XER(r1)
+	std	r6, SOFTE(r1)
+	LOAD_PACA_TOC()
+	LOAD_REG_IMMEDIATE(3, STACK_FRAME_REGS_MARKER)
+	std	r3, STACK_INT_FRAME_MARKER(r1)
+
+	/*
+	 * XXX On POWER7 and POWER8, we just spin here since we don't
+	 * know what the other threads are doing (and we don't want to
+	 * coordinate with them) - but at least we now have register state
+	 * in memory that we might be able to look at from another CPU.
+	 */
+	b	.
+
+/*
+ * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken
+ * from VCPU_INTR_MSR and is modified based on the required TM state changes.
+ *   r11 has the guest MSR value (in/out)
+ *   r9 has a vcpu pointer (in)
+ *   r0 is used as a scratch register
+ */
+SYM_FUNC_START_LOCAL(kvmppc_msr_interrupt)
+	rldicl	r0, r11, 64 - MSR_TS_S_LG, 62
+	cmpwi	r0, 2 /* Check if we are in transactional state..  */
+	ld	r11, VCPU_INTR_MSR(r9)
+	bne	1f
+	/* ... if transactional, change to suspended */
+	li	r0, 1
+1:	rldimi	r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	blr
+SYM_FUNC_END(kvmppc_msr_interrupt)
+
+/*
+ * void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu)
+ *
+ * Load up guest PMU state.  R3 points to the vcpu struct.
+ */
+SYM_FUNC_START_LOCAL(kvmhv_load_guest_pmu)
+	mr	r4, r3
+	mflr	r0
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+BEGIN_FTR_SECTION
+	ld	r3, VCPU_MMCR(r4)
+	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r5, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCRA(r4)
+	ld	r7, VCPU_SIAR(r4)
+	ld	r8, VCPU_SDAR(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_SIAR, r7
+	mtspr	SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_MMCR + 16(r4)
+	ld	r6, VCPU_SIER(r4)
+	mtspr	SPRN_MMCR2, r5
+	mtspr	SPRN_SIER, r6
+	lwz	r7, VCPU_PMC + 24(r4)
+	lwz	r8, VCPU_PMC + 28(r4)
+	ld	r9, VCPU_MMCRS(r4)
+	mtspr	SPRN_SPMC1, r7
+	mtspr	SPRN_SPMC2, r8
+	mtspr	SPRN_MMCRS, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+	mtlr	r0
+	blr
+SYM_FUNC_END(kvmhv_load_guest_pmu)
+
+/*
+ * void kvmhv_load_host_pmu(void)
+ *
+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
+ */
+SYM_FUNC_START_LOCAL(kvmhv_load_host_pmu)
+	mflr	r0
+	lbz	r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+BEGIN_FTR_SECTION
+	ld	r3, HSTATE_MMCR0(r13)
+	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r4, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, HSTATE_PMC1(r13)
+	lwz	r4, HSTATE_PMC2(r13)
+	lwz	r5, HSTATE_PMC3(r13)
+	lwz	r6, HSTATE_PMC4(r13)
+	lwz	r8, HSTATE_PMC5(r13)
+	lwz	r9, HSTATE_PMC6(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR0(r13)
+	ld	r4, HSTATE_MMCR1(r13)
+	ld	r5, HSTATE_MMCRA(r13)
+	ld	r6, HSTATE_SIAR(r13)
+	ld	r7, HSTATE_SDAR(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_SIAR, r6
+	mtspr	SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+	ld	r8, HSTATE_MMCR2(r13)
+	ld	r9, HSTATE_SIER(r13)
+	mtspr	SPRN_MMCR2, r8
+	mtspr	SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+	mtlr	r0
+23:	blr
+SYM_FUNC_END(kvmhv_load_host_pmu)
+
+/*
+ * void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use)
+ *
+ * Save guest PMU state into the vcpu struct.
+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
+ */
+SYM_FUNC_START_LOCAL(kvmhv_save_guest_pmu)
+	mr	r9, r3
+	mr	r8, r4
+BEGIN_FTR_SECTION
+	/*
+	 * POWER8 seems to have a hardware bug where setting
+	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+	 * when some counters are already negative doesn't seem
+	 * to cause a performance monitor alert (and hence interrupt).
+	 * The effect of this is that when saving the PMU state,
+	 * if there is no PMU alert pending when we read MMCR0
+	 * before freezing the counters, but one becomes pending
+	 * before we read the counters, we lose it.
+	 * To work around this, we need a way to freeze the counters
+	 * before reading MMCR0.  Normally, freezing the counters
+	 * is done by writing MMCR0 (to set MMCR0[FC]) which
+	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
+	 * we can also freeze the counters using MMCR2, by writing
+	 * 1s to all the counter freeze condition bits (there are
+	 * 9 bits each for 6 counters).
+	 */
+	li	r3, -1			/* set all freeze bits */
+	clrrdi	r3, r3, 10
+	mfspr	r10, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r7, 0
+	mtspr	SPRN_MMCRA, r7
+	isync
+	cmpwi	r8, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
+	mfspr	r7, SPRN_SIAR
+	mfspr	r8, SPRN_SDAR
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCRA(r9)
+BEGIN_FTR_SECTION
+	std	r10, VCPU_MMCR + 16(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	std	r7, VCPU_SIAR(r9)
+	std	r8, VCPU_SDAR(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_SIER
+	std	r5, VCPU_SIER(r9)
+	mfspr	r6, SPRN_SPMC1
+	mfspr	r7, SPRN_SPMC2
+	mfspr	r8, SPRN_MMCRS
+	stw	r6, VCPU_PMC + 24(r9)
+	stw	r7, VCPU_PMC + 28(r9)
+	std	r8, VCPU_MMCRS(r9)
+	lis	r4, 0x8000
+	mtspr	SPRN_MMCRS, r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22:	blr
+SYM_FUNC_END(kvmhv_save_guest_pmu)
+
+/*
+ * This works around a hardware bug on POWER8E processors, where
+ * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
+ * performance monitor interrupt.  Instead, when we need to have
+ * an interrupt pending, we have to arrange for a counter to overflow.
+ */
+kvmppc_fix_pmao:
+	li	r3, 0
+	mtspr	SPRN_MMCR2, r3
+	lis	r3, (MMCR0_PMXE | MMCR0_FCECE)@h
+	ori	r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN
+	mtspr	SPRN_MMCR0, r3
+	lis	r3, 0x7fff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_PMC6, r3
+	isync
+	blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET_APPL(r5)
+	mftb	r5
+	subf	r5, r6, r5	/* subtract current timebase offset */
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	std	r5, VCPU_ACTIVITY_START(r4)
+	blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r8, VCORE_TB_OFFSET_APPL(r5)
+	ld	r5, VCPU_CUR_ACTIVITY(r4)
+	ld	r6, VCPU_ACTIVITY_START(r4)
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	mftb	r7
+	subf	r7, r8, r7	/* subtract current timebase offset */
+	std	r7, VCPU_ACTIVITY_START(r4)
+	cmpdi	r5, 0
+	beqlr
+	subf	r3, r6, r7
+	ld	r8, TAS_SEQCOUNT(r5)
+	cmpdi	r8, 0
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	lwsync
+	ld	r7, TAS_TOTAL(r5)
+	add	r7, r7, r3
+	std	r7, TAS_TOTAL(r5)
+	ld	r6, TAS_MIN(r5)
+	ld	r7, TAS_MAX(r5)
+	beq	3f
+	cmpd	r3, r6
+	bge	1f
+3:	std	r3, TAS_MIN(r5)
+1:	cmpd	r3, r7
+	ble	2f
+	std	r3, TAS_MAX(r5)
+2:	lwsync
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
new file mode 100644
index 0000000000..866cadd700
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_book3s_64.h>
+#include <asm/reg.h>
+#include <asm/ppc-opcode.h>
+
+static void emulate_tx_failure(struct kvm_vcpu *vcpu, u64 failure_cause)
+{
+	u64 texasr, tfiar;
+	u64 msr = vcpu->arch.shregs.msr;
+
+	tfiar = vcpu->arch.regs.nip & ~0x3ull;
+	texasr = (failure_cause << 56) | TEXASR_ABORT | TEXASR_FS | TEXASR_EXACT;
+	if (MSR_TM_SUSPENDED(vcpu->arch.shregs.msr))
+		texasr |= TEXASR_SUSP;
+	if (msr & MSR_PR) {
+		texasr |= TEXASR_PR;
+		tfiar |= 1;
+	}
+	vcpu->arch.tfiar = tfiar;
+	/* Preserve ROT and TL fields of existing TEXASR */
+	vcpu->arch.texasr = (vcpu->arch.texasr & 0x3ffffff) | texasr;
+}
+
+/*
+ * This gets called on a softpatch interrupt on POWER9 DD2.2 processors.
+ * We expect to find a TM-related instruction to be emulated.  The
+ * instruction image is in vcpu->arch.emul_inst.  If the guest was in
+ * TM suspended or transactional state, the checkpointed state has been
+ * reclaimed and is in the vcpu struct.  The CPU is in virtual mode in
+ * host context.
+ */
+int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
+{
+	u32 instr = vcpu->arch.emul_inst;
+	u64 msr = vcpu->arch.shregs.msr;
+	u64 newmsr, bescr;
+	int ra, rs;
+
+	/*
+	 * The TM softpatch interrupt sets NIP to the instruction following
+	 * the faulting instruction, which is not executed. Rewind nip to the
+	 * faulting instruction so it looks like a normal synchronous
+	 * interrupt, then update nip in the places where the instruction is
+	 * emulated.
+	 */
+	vcpu->arch.regs.nip -= 4;
+
+	/*
+	 * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit
+	 * in these instructions, so masking bit 31 out doesn't change these
+	 * instructions. For treclaim., tsr., and trechkpt. instructions if bit
+	 * 31 = 0 then they are per ISA invalid forms, however P9 UM, in section
+	 * 4.6.10 Book II Invalid Forms, informs specifically that ignoring bit
+	 * 31 is an acceptable way to handle these invalid forms that have
+	 * bit 31 = 0. Moreover, for emulation purposes both forms (w/ and wo/
+	 * bit 31 set) can generate a softpatch interrupt. Hence both forms
+	 * are handled below for these instructions so they behave the same way.
+	 */
+	switch (instr & PO_XOP_OPCODE_MASK) {
+	case PPC_INST_RFID:
+		/* XXX do we need to check for PR=0 here? */
+		newmsr = vcpu->arch.shregs.srr1;
+		/* should only get here for Sx -> T1 transition */
+		WARN_ON_ONCE(!(MSR_TM_SUSPENDED(msr) &&
+			       MSR_TM_TRANSACTIONAL(newmsr) &&
+			       (newmsr & MSR_TM)));
+		newmsr = sanitize_msr(newmsr);
+		vcpu->arch.shregs.msr = newmsr;
+		vcpu->arch.cfar = vcpu->arch.regs.nip;
+		vcpu->arch.regs.nip = vcpu->arch.shregs.srr0;
+		return RESUME_GUEST;
+
+	case PPC_INST_RFEBB:
+		if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) {
+			/* generate an illegal instruction interrupt */
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			return RESUME_GUEST;
+		}
+		/* check EBB facility is available */
+		if (!(vcpu->arch.hfscr & HFSCR_EBB)) {
+			vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+			vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56;
+			vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+			return -1; /* rerun host interrupt handler */
+		}
+		if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) {
+			/* generate a facility unavailable interrupt */
+			vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+			vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56;
+			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL);
+			return RESUME_GUEST;
+		}
+		bescr = vcpu->arch.bescr;
+		/* expect to see a S->T transition requested */
+		WARN_ON_ONCE(!(MSR_TM_SUSPENDED(msr) &&
+			       ((bescr >> 30) & 3) == 2));
+		bescr &= ~BESCR_GE;
+		if (instr & (1 << 11))
+			bescr |= BESCR_GE;
+		vcpu->arch.bescr = bescr;
+		msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
+		vcpu->arch.shregs.msr = msr;
+		vcpu->arch.cfar = vcpu->arch.regs.nip;
+		vcpu->arch.regs.nip = vcpu->arch.ebbrr;
+		return RESUME_GUEST;
+
+	case PPC_INST_MTMSRD:
+		/* XXX do we need to check for PR=0 here? */
+		rs = (instr >> 21) & 0x1f;
+		newmsr = kvmppc_get_gpr(vcpu, rs);
+		/* check this is a Sx -> T1 transition */
+		WARN_ON_ONCE(!(MSR_TM_SUSPENDED(msr) &&
+			       MSR_TM_TRANSACTIONAL(newmsr) &&
+			       (newmsr & MSR_TM)));
+		/* mtmsrd doesn't change LE */
+		newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE);
+		newmsr = sanitize_msr(newmsr);
+		vcpu->arch.shregs.msr = newmsr;
+		vcpu->arch.regs.nip += 4;
+		return RESUME_GUEST;
+
+	/* ignore bit 31, see comment above */
+	case (PPC_INST_TSR & PO_XOP_OPCODE_MASK):
+		/* check for PR=1 and arch 2.06 bit set in PCR */
+		if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) {
+			/* generate an illegal instruction interrupt */
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			return RESUME_GUEST;
+		}
+		/* check for TM disabled in the HFSCR or MSR */
+		if (!(vcpu->arch.hfscr & HFSCR_TM)) {
+			vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+			vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+			vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+			return -1; /* rerun host interrupt handler */
+		}
+		if (!(msr & MSR_TM)) {
+			/* generate a facility unavailable interrupt */
+			vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+			vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
+			kvmppc_book3s_queue_irqprio(vcpu,
+						BOOK3S_INTERRUPT_FAC_UNAVAIL);
+			return RESUME_GUEST;
+		}
+		/* Set CR0 to indicate previous transactional state */
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
+		/* L=1 => tresume, L=0 => tsuspend */
+		if (instr & (1 << 21)) {
+			if (MSR_TM_SUSPENDED(msr))
+				msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
+		} else {
+			if (MSR_TM_TRANSACTIONAL(msr))
+				msr = (msr & ~MSR_TS_MASK) | MSR_TS_S;
+		}
+		vcpu->arch.shregs.msr = msr;
+		vcpu->arch.regs.nip += 4;
+		return RESUME_GUEST;
+
+	/* ignore bit 31, see comment above */
+	case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK):
+		/* check for TM disabled in the HFSCR or MSR */
+		if (!(vcpu->arch.hfscr & HFSCR_TM)) {
+			vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+			vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+			vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+			return -1; /* rerun host interrupt handler */
+		}
+		if (!(msr & MSR_TM)) {
+			/* generate a facility unavailable interrupt */
+			vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+			vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
+			kvmppc_book3s_queue_irqprio(vcpu,
+						BOOK3S_INTERRUPT_FAC_UNAVAIL);
+			return RESUME_GUEST;
+		}
+		/* If no transaction active, generate TM bad thing */
+		if (!MSR_TM_ACTIVE(msr)) {
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+			return RESUME_GUEST;
+		}
+		/* If failure was not previously recorded, recompute TEXASR */
+		if (!(vcpu->arch.orig_texasr & TEXASR_FS)) {
+			ra = (instr >> 16) & 0x1f;
+			if (ra)
+				ra = kvmppc_get_gpr(vcpu, ra) & 0xff;
+			emulate_tx_failure(vcpu, ra);
+		}
+
+		copy_from_checkpoint(vcpu);
+
+		/* Set CR0 to indicate previous transactional state */
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
+		vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
+		vcpu->arch.regs.nip += 4;
+		return RESUME_GUEST;
+
+	/* ignore bit 31, see comment above */
+	case (PPC_INST_TRECHKPT & PO_XOP_OPCODE_MASK):
+		/* XXX do we need to check for PR=0 here? */
+		/* check for TM disabled in the HFSCR or MSR */
+		if (!(vcpu->arch.hfscr & HFSCR_TM)) {
+			vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
+			vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56;
+			vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
+			return -1; /* rerun host interrupt handler */
+		}
+		if (!(msr & MSR_TM)) {
+			/* generate a facility unavailable interrupt */
+			vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
+			vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56;
+			kvmppc_book3s_queue_irqprio(vcpu,
+						BOOK3S_INTERRUPT_FAC_UNAVAIL);
+			return RESUME_GUEST;
+		}
+		/* If transaction active or TEXASR[FS] = 0, bad thing */
+		if (MSR_TM_ACTIVE(msr) || !(vcpu->arch.texasr & TEXASR_FS)) {
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+			return RESUME_GUEST;
+		}
+
+		copy_to_checkpoint(vcpu);
+
+		/* Set CR0 to indicate previous transactional state */
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
+		vcpu->arch.shregs.msr = msr | MSR_TS_S;
+		vcpu->arch.regs.nip += 4;
+		return RESUME_GUEST;
+	}
+
+	/* What should we do here? We didn't recognize the instruction */
+	kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+	pr_warn_ratelimited("Unrecognized TM-related instruction %#x for emulation", instr);
+
+	return RESUME_GUEST;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
new file mode 100644
index 0000000000..fad931f224
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_book3s_64.h>
+#include <asm/reg.h>
+#include <asm/ppc-opcode.h>
+
+/*
+ * This handles the cases where the guest is in real suspend mode
+ * and we want to get back to the guest without dooming the transaction.
+ * The caller has checked that the guest is in real-suspend mode
+ * (MSR[TS] = S and the fake-suspend flag is not set).
+ */
+int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
+{
+	u32 instr = vcpu->arch.emul_inst;
+	u64 newmsr, msr, bescr;
+	int rs;
+
+	/*
+	 * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit
+	 * in these instructions, so masking bit 31 out doesn't change these
+	 * instructions. For the tsr. instruction if bit 31 = 0 then it is per
+	 * ISA an invalid form, however P9 UM, in section 4.6.10 Book II Invalid
+	 * Forms, informs specifically that ignoring bit 31 is an acceptable way
+	 * to handle TM-related invalid forms that have bit 31 = 0. Moreover,
+	 * for emulation purposes both forms (w/ and wo/ bit 31 set) can
+	 * generate a softpatch interrupt. Hence both forms are handled below
+	 * for tsr. to make them behave the same way.
+	 */
+	switch (instr & PO_XOP_OPCODE_MASK) {
+	case PPC_INST_RFID:
+		/* XXX do we need to check for PR=0 here? */
+		newmsr = vcpu->arch.shregs.srr1;
+		/* should only get here for Sx -> T1 transition */
+		if (!(MSR_TM_TRANSACTIONAL(newmsr) && (newmsr & MSR_TM)))
+			return 0;
+		newmsr = sanitize_msr(newmsr);
+		vcpu->arch.shregs.msr = newmsr;
+		vcpu->arch.cfar = vcpu->arch.regs.nip - 4;
+		vcpu->arch.regs.nip = vcpu->arch.shregs.srr0;
+		return 1;
+
+	case PPC_INST_RFEBB:
+		/* check for PR=1 and arch 2.06 bit set in PCR */
+		msr = vcpu->arch.shregs.msr;
+		if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206))
+			return 0;
+		/* check EBB facility is available */
+		if (!(vcpu->arch.hfscr & HFSCR_EBB) ||
+		    ((msr & MSR_PR) && !(mfspr(SPRN_FSCR) & FSCR_EBB)))
+			return 0;
+		bescr = mfspr(SPRN_BESCR);
+		/* expect to see a S->T transition requested */
+		if (((bescr >> 30) & 3) != 2)
+			return 0;
+		bescr &= ~BESCR_GE;
+		if (instr & (1 << 11))
+			bescr |= BESCR_GE;
+		mtspr(SPRN_BESCR, bescr);
+		msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
+		vcpu->arch.shregs.msr = msr;
+		vcpu->arch.cfar = vcpu->arch.regs.nip - 4;
+		vcpu->arch.regs.nip = mfspr(SPRN_EBBRR);
+		return 1;
+
+	case PPC_INST_MTMSRD:
+		/* XXX do we need to check for PR=0 here? */
+		rs = (instr >> 21) & 0x1f;
+		newmsr = kvmppc_get_gpr(vcpu, rs);
+		msr = vcpu->arch.shregs.msr;
+		/* check this is a Sx -> T1 transition */
+		if (!(MSR_TM_TRANSACTIONAL(newmsr) && (newmsr & MSR_TM)))
+			return 0;
+		/* mtmsrd doesn't change LE */
+		newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE);
+		newmsr = sanitize_msr(newmsr);
+		vcpu->arch.shregs.msr = newmsr;
+		return 1;
+
+	/* ignore bit 31, see comment above */
+	case (PPC_INST_TSR & PO_XOP_OPCODE_MASK):
+		/* we know the MSR has the TS field = S (0b01) here */
+		msr = vcpu->arch.shregs.msr;
+		/* check for PR=1 and arch 2.06 bit set in PCR */
+		if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206))
+			return 0;
+		/* check for TM disabled in the HFSCR or MSR */
+		if (!(vcpu->arch.hfscr & HFSCR_TM) || !(msr & MSR_TM))
+			return 0;
+		/* L=1 => tresume => set TS to T (0b10) */
+		if (instr & (1 << 21))
+			vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
+		/* Set CR0 to 0b0010 */
+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+			0x20000000;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * This is called when we are returning to a guest in TM transactional
+ * state.  We roll the guest state back to the checkpointed state.
+ */
+void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.shregs.msr &= ~MSR_TS_MASK;	/* go to N state */
+	vcpu->arch.regs.nip = vcpu->arch.tfhar;
+	copy_from_checkpoint(vcpu);
+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
new file mode 100644
index 0000000000..e2d6f9327f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Secure pages management: Migration of pages between normal and secure
+ * memory of KVM guests.
+ *
+ * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
+ */
+
+/*
+ * A pseries guest can be run as secure guest on Ultravisor-enabled
+ * POWER platforms. On such platforms, this driver will be used to manage
+ * the movement of guest pages between the normal memory managed by
+ * hypervisor (HV) and secure memory managed by Ultravisor (UV).
+ *
+ * The page-in or page-out requests from UV will come to HV as hcalls and
+ * HV will call back into UV via ultracalls to satisfy these page requests.
+ *
+ * Private ZONE_DEVICE memory equal to the amount of secure memory
+ * available in the platform for running secure guests is hotplugged.
+ * Whenever a page belonging to the guest becomes secure, a page from this
+ * private device memory is used to represent and track that secure page
+ * on the HV side. Some pages (like virtio buffers, VPA pages etc) are
+ * shared between UV and HV. However such pages aren't represented by
+ * device private memory and mappings to shared memory exist in both
+ * UV and HV page tables.
+ */
+
+/*
+ * Notes on locking
+ *
+ * kvm->arch.uvmem_lock is a per-guest lock that prevents concurrent
+ * page-in and page-out requests for the same GPA. Concurrent accesses
+ * can either come via UV (guest vCPUs requesting for same page)
+ * or when HV and guest simultaneously access the same page.
+ * This mutex serializes the migration of page from HV(normal) to
+ * UV(secure) and vice versa. So the serialization points are around
+ * migrate_vma routines and page-in/out routines.
+ *
+ * Per-guest mutex comes with a cost though. Mainly it serializes the
+ * fault path as page-out can occur when HV faults on accessing secure
+ * guest pages. Currently UV issues page-in requests for all the guest
+ * PFNs one at a time during early boot (UV_ESM uvcall), so this is
+ * not a cause for concern. Also currently the number of page-outs caused
+ * by HV touching secure pages is very very low. If an when UV supports
+ * overcommitting, then we might see concurrent guest driven page-outs.
+ *
+ * Locking order
+ *
+ * 1. kvm->srcu - Protects KVM memslots
+ * 2. kvm->mm->mmap_lock - find_vma, migrate_vma_pages and helpers, ksm_madvise
+ * 3. kvm->arch.uvmem_lock - protects read/writes to uvmem slots thus acting
+ *			     as sync-points for page-in/out
+ */
+
+/*
+ * Notes on page size
+ *
+ * Currently UV uses 2MB mappings internally, but will issue H_SVM_PAGE_IN
+ * and H_SVM_PAGE_OUT hcalls in PAGE_SIZE(64K) granularity. HV tracks
+ * secure GPAs at 64K page size and maintains one device PFN for each
+ * 64K secure GPA. UV_PAGE_IN and UV_PAGE_OUT calls by HV are also issued
+ * for 64K page at a time.
+ *
+ * HV faulting on secure pages: When HV touches any secure page, it
+ * faults and issues a UV_PAGE_OUT request with 64K page size. Currently
+ * UV splits and remaps the 2MB page if necessary and copies out the
+ * required 64K page contents.
+ *
+ * Shared pages: Whenever guest shares a secure page, UV will split and
+ * remap the 2MB page if required and issue H_SVM_PAGE_IN with 64K page size.
+ *
+ * HV invalidating a page: When a regular page belonging to secure
+ * guest gets unmapped, HV informs UV with UV_PAGE_INVAL of 64K
+ * page size. Using 64K page size is correct here because any non-secure
+ * page will essentially be of 64K page size. Splitting by UV during sharing
+ * and page-out ensures this.
+ *
+ * Page fault handling: When HV handles page fault of a page belonging
+ * to secure guest, it sends that to UV with a 64K UV_PAGE_IN request.
+ * Using 64K size is correct here too as UV would have split the 2MB page
+ * into 64k mappings and would have done page-outs earlier.
+ *
+ * In summary, the current secure pages handling code in HV assumes
+ * 64K page size and in fact fails any page-in/page-out requests of
+ * non-64K size upfront. If and when UV starts supporting multiple
+ * page-sizes, we need to break this assumption.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/migrate.h>
+#include <linux/kvm_host.h>
+#include <linux/ksm.h>
+#include <linux/of.h>
+#include <linux/memremap.h>
+#include <asm/ultravisor.h>
+#include <asm/mman.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
+
+static struct dev_pagemap kvmppc_uvmem_pgmap;
+static unsigned long *kvmppc_uvmem_bitmap;
+static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
+
+/*
+ * States of a GFN
+ * ---------------
+ * The GFN can be in one of the following states.
+ *
+ * (a) Secure - The GFN is secure. The GFN is associated with
+ *	a Secure VM, the contents of the GFN is not accessible
+ *	to the Hypervisor.  This GFN can be backed by a secure-PFN,
+ *	or can be backed by a normal-PFN with contents encrypted.
+ *	The former is true when the GFN is paged-in into the
+ *	ultravisor. The latter is true when the GFN is paged-out
+ *	of the ultravisor.
+ *
+ * (b) Shared - The GFN is shared. The GFN is associated with a
+ *	a secure VM. The contents of the GFN is accessible to
+ *	Hypervisor. This GFN is backed by a normal-PFN and its
+ *	content is un-encrypted.
+ *
+ * (c) Normal - The GFN is a normal. The GFN is associated with
+ *	a normal VM. The contents of the GFN is accessible to
+ *	the Hypervisor. Its content is never encrypted.
+ *
+ * States of a VM.
+ * ---------------
+ *
+ * Normal VM:  A VM whose contents are always accessible to
+ *	the hypervisor.  All its GFNs are normal-GFNs.
+ *
+ * Secure VM: A VM whose contents are not accessible to the
+ *	hypervisor without the VM's consent.  Its GFNs are
+ *	either Shared-GFN or Secure-GFNs.
+ *
+ * Transient VM: A Normal VM that is transitioning to secure VM.
+ *	The transition starts on successful return of
+ *	H_SVM_INIT_START, and ends on successful return
+ *	of H_SVM_INIT_DONE. This transient VM, can have GFNs
+ *	in any of the three states; i.e Secure-GFN, Shared-GFN,
+ *	and Normal-GFN.	The VM never executes in this state
+ *	in supervisor-mode.
+ *
+ * Memory slot State.
+ * -----------------------------
+ *	The state of a memory slot mirrors the state of the
+ *	VM the memory slot is associated with.
+ *
+ * VM State transition.
+ * --------------------
+ *
+ *  A VM always starts in Normal Mode.
+ *
+ *  H_SVM_INIT_START moves the VM into transient state. During this
+ *  time the Ultravisor may request some of its GFNs to be shared or
+ *  secured. So its GFNs can be in one of the three GFN states.
+ *
+ *  H_SVM_INIT_DONE moves the VM entirely from transient state to
+ *  secure-state. At this point any left-over normal-GFNs are
+ *  transitioned to Secure-GFN.
+ *
+ *  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
+ *  All its GFNs are moved to Normal-GFNs.
+ *
+ *  UV_TERMINATE transitions the secure-VM back to normal-VM. All
+ *  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
+ *  Note: The contents of the normal-GFN is undefined at this point.
+ *
+ * GFN state implementation:
+ * -------------------------
+ *
+ * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
+ * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
+ * set, and contains the value of the secure-PFN.
+ * It is associated with a normal-PFN; also called mem_pfn, when
+ * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
+ * The value of the normal-PFN is not tracked.
+ *
+ * Shared GFN is associated with a normal-PFN. Its pfn[] has
+ * KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
+ * is not tracked.
+ *
+ * Normal GFN is associated with normal-PFN. Its pfn[] has
+ * no flag set. The value of the normal-PFN is not tracked.
+ *
+ * Life cycle of a GFN
+ * --------------------
+ *
+ * --------------------------------------------------------------
+ * |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
+ * |        |operation   |operation | abort/    |               |
+ * |        |            |          | terminate |               |
+ * -------------------------------------------------------------
+ * |        |            |          |           |               |
+ * | Secure |     Shared | Secure   |Normal     |Secure         |
+ * |        |            |          |           |               |
+ * | Shared |     Shared | Secure   |Normal     |Shared         |
+ * |        |            |          |           |               |
+ * | Normal |     Shared | Secure   |Normal     |Secure         |
+ * --------------------------------------------------------------
+ *
+ * Life cycle of a VM
+ * --------------------
+ *
+ * --------------------------------------------------------------------
+ * |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
+ * |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
+ * |         |           |          |         |           |           |
+ * --------- ----------------------------------------------------------
+ * |         |           |          |         |           |           |
+ * | Normal  | Normal    | Transient|Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * | Secure  |   Error   | Error    |Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
+ * --------------------------------------------------------------------
+ */
+
+#define KVMPPC_GFN_UVMEM_PFN	(1UL << 63)
+#define KVMPPC_GFN_MEM_PFN	(1UL << 62)
+#define KVMPPC_GFN_SHARED	(1UL << 61)
+#define KVMPPC_GFN_SECURE	(KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
+#define KVMPPC_GFN_FLAG_MASK	(KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
+#define KVMPPC_GFN_PFN_MASK	(~KVMPPC_GFN_FLAG_MASK)
+
+struct kvmppc_uvmem_slot {
+	struct list_head list;
+	unsigned long nr_pfns;
+	unsigned long base_pfn;
+	unsigned long *pfns;
+};
+struct kvmppc_uvmem_page_pvt {
+	struct kvm *kvm;
+	unsigned long gpa;
+	bool skip_page_out;
+	bool remove_gfn;
+};
+
+bool kvmppc_uvmem_available(void)
+{
+	/*
+	 * If kvmppc_uvmem_bitmap != NULL, then there is an ultravisor
+	 * and our data structures have been initialized successfully.
+	 */
+	return !!kvmppc_uvmem_bitmap;
+}
+
+int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot)
+{
+	struct kvmppc_uvmem_slot *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	p->pfns = vcalloc(slot->npages, sizeof(*p->pfns));
+	if (!p->pfns) {
+		kfree(p);
+		return -ENOMEM;
+	}
+	p->nr_pfns = slot->npages;
+	p->base_pfn = slot->base_gfn;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	list_add(&p->list, &kvm->arch.uvmem_pfns);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+
+	return 0;
+}
+
+/*
+ * All device PFNs are already released by the time we come here.
+ */
+void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
+{
+	struct kvmppc_uvmem_slot *p, *next;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	list_for_each_entry_safe(p, next, &kvm->arch.uvmem_pfns, list) {
+		if (p->base_pfn == slot->base_gfn) {
+			vfree(p->pfns);
+			list_del(&p->list);
+			kfree(p);
+			break;
+		}
+	}
+	mutex_unlock(&kvm->arch.uvmem_lock);
+}
+
+static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
+			unsigned long flag, unsigned long uvmem_pfn)
+{
+	struct kvmppc_uvmem_slot *p;
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
+		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
+			unsigned long index = gfn - p->base_pfn;
+
+			if (flag == KVMPPC_GFN_UVMEM_PFN)
+				p->pfns[index] = uvmem_pfn | flag;
+			else
+				p->pfns[index] = flag;
+			return;
+		}
+	}
+}
+
+/* mark the GFN as secure-GFN associated with @uvmem pfn device-PFN. */
+static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
+			unsigned long uvmem_pfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
+}
+
+/* mark the GFN as secure-GFN associated with a memory-PFN. */
+static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
+}
+
+/* mark the GFN as a shared GFN. */
+static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
+}
+
+/* mark the GFN as a non-existent GFN. */
+static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, 0, 0);
+}
+
+/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
+static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
+				    unsigned long *uvmem_pfn)
+{
+	struct kvmppc_uvmem_slot *p;
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
+		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
+			unsigned long index = gfn - p->base_pfn;
+
+			if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
+				if (uvmem_pfn)
+					*uvmem_pfn = p->pfns[index] &
+						     KVMPPC_GFN_PFN_MASK;
+				return true;
+			} else
+				return false;
+		}
+	}
+	return false;
+}
+
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ *
+ * Must be called with kvm->arch.uvmem_lock  held.
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+		struct kvm *kvm, unsigned long *gfn)
+{
+	struct kvmppc_uvmem_slot *p = NULL, *iter;
+	bool ret = false;
+	unsigned long i;
+
+	list_for_each_entry(iter, &kvm->arch.uvmem_pfns, list)
+		if (*gfn >= iter->base_pfn && *gfn < iter->base_pfn + iter->nr_pfns) {
+			p = iter;
+			break;
+		}
+	if (!p)
+		return ret;
+	/*
+	 * The code below assumes, one to one correspondence between
+	 * kvmppc_uvmem_slot and memslot.
+	 */
+	for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+		unsigned long index = i - p->base_pfn;
+
+		if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+			*gfn = i;
+			ret = true;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int kvmppc_memslot_page_merge(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot, bool merge)
+{
+	unsigned long gfn = memslot->base_gfn;
+	unsigned long end, start = gfn_to_hva(kvm, gfn);
+	unsigned long vm_flags;
+	int ret = 0;
+	struct vm_area_struct *vma;
+	int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
+
+	if (kvm_is_error_hva(start))
+		return H_STATE;
+
+	end = start + (memslot->npages << PAGE_SHIFT);
+
+	mmap_write_lock(kvm->mm);
+	do {
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma) {
+			ret = H_STATE;
+			break;
+		}
+		vma_start_write(vma);
+		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
+		vm_flags = vma->vm_flags;
+		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  merge_flag, &vm_flags);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+		vm_flags_reset(vma, vm_flags);
+		start = vma->vm_end;
+	} while (end > vma->vm_end);
+
+	mmap_write_unlock(kvm->mm);
+	return ret;
+}
+
+static void __kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
+	kvmppc_uvmem_slot_free(kvm, memslot);
+	kvmppc_memslot_page_merge(kvm, memslot, true);
+}
+
+static int __kvmppc_uvmem_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	int ret = H_PARAMETER;
+
+	if (kvmppc_memslot_page_merge(kvm, memslot, false))
+		return ret;
+
+	if (kvmppc_uvmem_slot_init(kvm, memslot))
+		goto out1;
+
+	ret = uv_register_mem_slot(kvm->arch.lpid,
+				   memslot->base_gfn << PAGE_SHIFT,
+				   memslot->npages * PAGE_SIZE,
+				   0, memslot->id);
+	if (ret < 0) {
+		ret = H_PARAMETER;
+		goto out;
+	}
+	return 0;
+out:
+	kvmppc_uvmem_slot_free(kvm, memslot);
+out1:
+	kvmppc_memslot_page_merge(kvm, memslot, true);
+	return ret;
+}
+
+unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot, *m;
+	int ret = H_SUCCESS;
+	int srcu_idx, bkt;
+
+	kvm->arch.secure_guest = KVMPPC_SECURE_INIT_START;
+
+	if (!kvmppc_uvmem_bitmap)
+		return H_UNSUPPORTED;
+
+	/* Only radix guests can be secure guests */
+	if (!kvm_is_radix(kvm))
+		return H_UNSUPPORTED;
+
+	/* NAK the transition to secure if not enabled */
+	if (!kvm->arch.svm_enabled)
+		return H_AUTHORITY;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	/* register the memslot */
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, bkt, slots) {
+		ret = __kvmppc_uvmem_memslot_create(kvm, memslot);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		slots = kvm_memslots(kvm);
+		kvm_for_each_memslot(m, bkt, slots) {
+			if (m == memslot)
+				break;
+			__kvmppc_uvmem_memslot_delete(kvm, memslot);
+		}
+	}
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
+/*
+ * Provision a new page on HV side and copy over the contents
+ * from secure memory using UV_PAGE_OUT uvcall.
+ * Caller must held kvm->arch.uvmem_lock.
+ */
+static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long page_shift,
+		struct kvm *kvm, unsigned long gpa, struct page *fault_page)
+{
+	unsigned long src_pfn, dst_pfn = 0;
+	struct migrate_vma mig = { 0 };
+	struct page *dpage, *spage;
+	struct kvmppc_uvmem_page_pvt *pvt;
+	unsigned long pfn;
+	int ret = U_SUCCESS;
+
+	memset(&mig, 0, sizeof(mig));
+	mig.vma = vma;
+	mig.start = start;
+	mig.end = end;
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
+	mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+	mig.fault_page = fault_page;
+
+	/* The requested page is already paged-out, nothing to do */
+	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
+		return ret;
+
+	ret = migrate_vma_setup(&mig);
+	if (ret)
+		return -1;
+
+	spage = migrate_pfn_to_page(*mig.src);
+	if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))
+		goto out_finalize;
+
+	if (!is_zone_device_page(spage))
+		goto out_finalize;
+
+	dpage = alloc_page_vma(GFP_HIGHUSER, vma, start);
+	if (!dpage) {
+		ret = -1;
+		goto out_finalize;
+	}
+
+	lock_page(dpage);
+	pvt = spage->zone_device_data;
+	pfn = page_to_pfn(dpage);
+
+	/*
+	 * This function is used in two cases:
+	 * - When HV touches a secure page, for which we do UV_PAGE_OUT
+	 * - When a secure page is converted to shared page, we *get*
+	 *   the page to essentially unmap the device page. In this
+	 *   case we skip page-out.
+	 */
+	if (!pvt->skip_page_out)
+		ret = uv_page_out(kvm->arch.lpid, pfn << page_shift,
+				  gpa, 0, page_shift);
+
+	if (ret == U_SUCCESS)
+		*mig.dst = migrate_pfn(pfn);
+	else {
+		unlock_page(dpage);
+		__free_page(dpage);
+		goto out_finalize;
+	}
+
+	migrate_vma_pages(&mig);
+
+out_finalize:
+	migrate_vma_finalize(&mig);
+	return ret;
+}
+
+static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
+				      unsigned long start, unsigned long end,
+				      unsigned long page_shift,
+				      struct kvm *kvm, unsigned long gpa,
+				      struct page *fault_page)
+{
+	int ret;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
+				fault_page);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+
+	return ret;
+}
+
+/*
+ * Drop device pages that we maintain for the secure guest
+ *
+ * We first mark the pages to be skipped from UV_PAGE_OUT when there
+ * is HV side fault on these pages. Next we *get* these pages, forcing
+ * fault on them, do fault time migration to replace the device PTEs in
+ * QEMU page table with normal PTEs from newly allocated pages.
+ */
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
+			     struct kvm *kvm, bool skip_page_out)
+{
+	int i;
+	struct kvmppc_uvmem_page_pvt *pvt;
+	struct page *uvmem_page;
+	struct vm_area_struct *vma = NULL;
+	unsigned long uvmem_pfn, gfn;
+	unsigned long addr;
+
+	mmap_read_lock(kvm->mm);
+
+	addr = slot->userspace_addr;
+
+	gfn = slot->base_gfn;
+	for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+		/* Fetch the VMA if addr is not in the latest fetched one */
+		if (!vma || addr >= vma->vm_end) {
+			vma = vma_lookup(kvm->mm, addr);
+			if (!vma) {
+				pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+				break;
+			}
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+
+		if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			uvmem_page = pfn_to_page(uvmem_pfn);
+			pvt = uvmem_page->zone_device_data;
+			pvt->skip_page_out = skip_page_out;
+			pvt->remove_gfn = true;
+
+			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
+						  PAGE_SHIFT, kvm, pvt->gpa, NULL))
+				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
+				       pvt->gpa, addr);
+		} else {
+			/* Remove the shared flag if any */
+			kvmppc_gfn_remove(gfn, kvm);
+		}
+
+		mutex_unlock(&kvm->arch.uvmem_lock);
+	}
+
+	mmap_read_unlock(kvm->mm);
+}
+
+unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
+{
+	int srcu_idx, bkt;
+	struct kvm_memory_slot *memslot;
+
+	/*
+	 * Expect to be called only after INIT_START and before INIT_DONE.
+	 * If INIT_DONE was completed, use normal VM termination sequence.
+	 */
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return H_UNSUPPORTED;
+
+	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
+		return H_STATE;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	kvm_for_each_memslot(memslot, bkt, kvm_memslots(kvm))
+		kvmppc_uvmem_drop_pages(memslot, kvm, false);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	kvm->arch.secure_guest = 0;
+	uv_svm_terminate(kvm->arch.lpid);
+
+	return H_PARAMETER;
+}
+
+/*
+ * Get a free device PFN from the pool
+ *
+ * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device
+ * PFN will be used to keep track of the secure page on HV side.
+ *
+ * Called with kvm->arch.uvmem_lock held
+ */
+static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
+{
+	struct page *dpage = NULL;
+	unsigned long bit, uvmem_pfn;
+	struct kvmppc_uvmem_page_pvt *pvt;
+	unsigned long pfn_last, pfn_first;
+
+	pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
+	pfn_last = pfn_first +
+		   (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
+
+	spin_lock(&kvmppc_uvmem_bitmap_lock);
+	bit = find_first_zero_bit(kvmppc_uvmem_bitmap,
+				  pfn_last - pfn_first);
+	if (bit >= (pfn_last - pfn_first))
+		goto out;
+	bitmap_set(kvmppc_uvmem_bitmap, bit, 1);
+	spin_unlock(&kvmppc_uvmem_bitmap_lock);
+
+	pvt = kzalloc(sizeof(*pvt), GFP_KERNEL);
+	if (!pvt)
+		goto out_clear;
+
+	uvmem_pfn = bit + pfn_first;
+	kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
+
+	pvt->gpa = gpa;
+	pvt->kvm = kvm;
+
+	dpage = pfn_to_page(uvmem_pfn);
+	dpage->zone_device_data = pvt;
+	zone_device_page_init(dpage);
+	return dpage;
+out_clear:
+	spin_lock(&kvmppc_uvmem_bitmap_lock);
+	bitmap_clear(kvmppc_uvmem_bitmap, bit, 1);
+out:
+	spin_unlock(&kvmppc_uvmem_bitmap_lock);
+	return NULL;
+}
+
+/*
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
+ */
+static int kvmppc_svm_page_in(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
+{
+	unsigned long src_pfn, dst_pfn = 0;
+	struct migrate_vma mig = { 0 };
+	struct page *spage;
+	unsigned long pfn;
+	struct page *dpage;
+	int ret = 0;
+
+	memset(&mig, 0, sizeof(mig));
+	mig.vma = vma;
+	mig.start = start;
+	mig.end = end;
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+	mig.flags = MIGRATE_VMA_SELECT_SYSTEM;
+
+	ret = migrate_vma_setup(&mig);
+	if (ret)
+		return ret;
+
+	if (!(*mig.src & MIGRATE_PFN_MIGRATE)) {
+		ret = -1;
+		goto out_finalize;
+	}
+
+	dpage = kvmppc_uvmem_get_page(gpa, kvm);
+	if (!dpage) {
+		ret = -1;
+		goto out_finalize;
+	}
+
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
+
+	*mig.dst = migrate_pfn(page_to_pfn(dpage));
+	migrate_vma_pages(&mig);
+out_finalize:
+	migrate_vma_finalize(&mig);
+	return ret;
+}
+
+static int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	int ret = 0;
+
+	mmap_read_lock(kvm->mm);
+	mutex_lock(&kvm->arch.uvmem_lock);
+	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
+		ret = H_STATE;
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start))
+			break;
+
+		end = start + (1UL << PAGE_SHIFT);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end)
+			break;
+
+		ret = kvmppc_svm_page_in(vma, start, end,
+				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+
+		/* relinquish the cpu if needed */
+		cond_resched();
+	}
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	mmap_read_unlock(kvm->mm);
+	return ret;
+}
+
+unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx, bkt;
+	long ret = H_SUCCESS;
+
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return H_UNSUPPORTED;
+
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, bkt, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			/*
+			 * The pages will remain transitioned.
+			 * Its the callers responsibility to
+			 * terminate the VM, which will undo
+			 * all state of the VM. Till then
+			 * this VM is in a erroneous state.
+			 * Its KVMPPC_SECURE_INIT_DONE will
+			 * remain unset.
+			 */
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
+	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
+	pr_info("LPID %d went secure\n", kvm->arch.lpid);
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
+/*
+ * Shares the page with HV, thus making it a normal page.
+ *
+ * - If the page is already secure, then provision a new page and share
+ * - If the page is a normal page, share the existing page
+ *
+ * In the former case, uses dev_pagemap_ops.migrate_to_ram handler
+ * to unmap the device page from QEMU's page tables.
+ */
+static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
+		unsigned long page_shift)
+{
+
+	int ret = H_PARAMETER;
+	struct page *uvmem_page;
+	struct kvmppc_uvmem_page_pvt *pvt;
+	unsigned long pfn;
+	unsigned long gfn = gpa >> page_shift;
+	int srcu_idx;
+	unsigned long uvmem_pfn;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	mutex_lock(&kvm->arch.uvmem_lock);
+	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+		uvmem_page = pfn_to_page(uvmem_pfn);
+		pvt = uvmem_page->zone_device_data;
+		pvt->skip_page_out = true;
+		/*
+		 * do not drop the GFN. It is a valid GFN
+		 * that is transitioned to a shared GFN.
+		 */
+		pvt->remove_gfn = false;
+	}
+
+retry:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	pfn = gfn_to_pfn(kvm, gfn);
+	if (is_error_noslot_pfn(pfn))
+		goto out;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+		uvmem_page = pfn_to_page(uvmem_pfn);
+		pvt = uvmem_page->zone_device_data;
+		pvt->skip_page_out = true;
+		pvt->remove_gfn = false; /* it continues to be a valid GFN */
+		kvm_release_pfn_clean(pfn);
+		goto retry;
+	}
+
+	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
+				page_shift)) {
+		kvmppc_gfn_shared(gfn, kvm);
+		ret = H_SUCCESS;
+	}
+	kvm_release_pfn_clean(pfn);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
+/*
+ * H_SVM_PAGE_IN: Move page from normal memory to secure memory.
+ *
+ * H_PAGE_IN_SHARED flag makes the page shared which means that the same
+ * memory in is visible from both UV and HV.
+ */
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+		unsigned long flags,
+		unsigned long page_shift)
+{
+	unsigned long start, end;
+	struct vm_area_struct *vma;
+	int srcu_idx;
+	unsigned long gfn = gpa >> page_shift;
+	int ret;
+
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return H_UNSUPPORTED;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P3;
+
+	if (flags & ~H_PAGE_IN_SHARED)
+		return H_P2;
+
+	if (flags & H_PAGE_IN_SHARED)
+		return kvmppc_share_page(kvm, gpa, page_shift);
+
+	ret = H_PARAMETER;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	mmap_read_lock(kvm->mm);
+
+	start = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(start))
+		goto out;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	/* Fail the page-in request of an already paged-in page */
+	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
+		goto out_unlock;
+
+	end = start + (1UL << page_shift);
+	vma = find_vma_intersection(kvm->mm, start, end);
+	if (!vma || vma->vm_start > start || vma->vm_end < end)
+		goto out_unlock;
+
+	if (kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
+
+out_unlock:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+out:
+	mmap_read_unlock(kvm->mm);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
+
+/*
+ * Fault handler callback that gets called when HV touches any page that
+ * has been moved to secure memory, we ask UV to give back the page by
+ * issuing UV_PAGE_OUT uvcall.
+ *
+ * This eventually results in dropping of device PFN and the newly
+ * provisioned page/PFN gets populated in QEMU page tables.
+ */
+static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
+{
+	struct kvmppc_uvmem_page_pvt *pvt = vmf->page->zone_device_data;
+
+	if (kvmppc_svm_page_out(vmf->vma, vmf->address,
+				vmf->address + PAGE_SIZE, PAGE_SHIFT,
+				pvt->kvm, pvt->gpa, vmf->page))
+		return VM_FAULT_SIGBUS;
+	else
+		return 0;
+}
+
+/*
+ * Release the device PFN back to the pool
+ *
+ * Gets called when secure GFN tranistions from a secure-PFN
+ * to a normal PFN during H_SVM_PAGE_OUT.
+ * Gets called with kvm->arch.uvmem_lock held.
+ */
+static void kvmppc_uvmem_page_free(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page) -
+			(kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
+	struct kvmppc_uvmem_page_pvt *pvt;
+
+	spin_lock(&kvmppc_uvmem_bitmap_lock);
+	bitmap_clear(kvmppc_uvmem_bitmap, pfn, 1);
+	spin_unlock(&kvmppc_uvmem_bitmap_lock);
+
+	pvt = page->zone_device_data;
+	page->zone_device_data = NULL;
+	if (pvt->remove_gfn)
+		kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	else
+		kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	kfree(pvt);
+}
+
+static const struct dev_pagemap_ops kvmppc_uvmem_ops = {
+	.page_free = kvmppc_uvmem_page_free,
+	.migrate_to_ram	= kvmppc_uvmem_migrate_to_ram,
+};
+
+/*
+ * H_SVM_PAGE_OUT: Move page from secure memory to normal memory.
+ */
+unsigned long
+kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
+		      unsigned long flags, unsigned long page_shift)
+{
+	unsigned long gfn = gpa >> page_shift;
+	unsigned long start, end;
+	struct vm_area_struct *vma;
+	int srcu_idx;
+	int ret;
+
+	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
+		return H_UNSUPPORTED;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P3;
+
+	if (flags)
+		return H_P2;
+
+	ret = H_PARAMETER;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	mmap_read_lock(kvm->mm);
+	start = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(start))
+		goto out;
+
+	end = start + (1UL << page_shift);
+	vma = find_vma_intersection(kvm->mm, start, end);
+	if (!vma || vma->vm_start > start || vma->vm_end < end)
+		goto out;
+
+	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
+		ret = H_SUCCESS;
+out:
+	mmap_read_unlock(kvm->mm);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
+}
+
+int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
+{
+	unsigned long pfn;
+	int ret = U_SUCCESS;
+
+	pfn = gfn_to_pfn(kvm, gfn);
+	if (is_error_noslot_pfn(pfn))
+		return -EFAULT;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
+		goto out;
+
+	ret = uv_page_in(kvm->arch.lpid, pfn << PAGE_SHIFT, gfn << PAGE_SHIFT,
+			 0, PAGE_SHIFT);
+out:
+	kvm_release_pfn_clean(pfn);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT;
+}
+
+int kvmppc_uvmem_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new)
+{
+	int ret = __kvmppc_uvmem_memslot_create(kvm, new);
+
+	if (!ret)
+		ret = kvmppc_uv_migrate_mem_slot(kvm, new);
+
+	return ret;
+}
+
+void kvmppc_uvmem_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old)
+{
+	__kvmppc_uvmem_memslot_delete(kvm, old);
+}
+
+static u64 kvmppc_get_secmem_size(void)
+{
+	struct device_node *np;
+	int i, len;
+	const __be32 *prop;
+	u64 size = 0;
+
+	/*
+	 * First try the new ibm,secure-memory nodes which supersede the
+	 * secure-memory-ranges property.
+	 * If we found some, no need to read the deprecated ones.
+	 */
+	for_each_compatible_node(np, NULL, "ibm,secure-memory") {
+		prop = of_get_property(np, "reg", &len);
+		if (!prop)
+			continue;
+		size += of_read_number(prop + 2, 2);
+	}
+	if (size)
+		return size;
+
+	np = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+	if (!np)
+		goto out;
+
+	prop = of_get_property(np, "secure-memory-ranges", &len);
+	if (!prop)
+		goto out_put;
+
+	for (i = 0; i < len / (sizeof(*prop) * 4); i++)
+		size += of_read_number(prop + (i * 4) + 2, 2);
+
+out_put:
+	of_node_put(np);
+out:
+	return size;
+}
+
+int kvmppc_uvmem_init(void)
+{
+	int ret = 0;
+	unsigned long size;
+	struct resource *res;
+	void *addr;
+	unsigned long pfn_last, pfn_first;
+
+	size = kvmppc_get_secmem_size();
+	if (!size) {
+		/*
+		 * Don't fail the initialization of kvm-hv module if
+		 * the platform doesn't export ibm,uv-firmware node.
+		 * Let normal guests run on such PEF-disabled platform.
+		 */
+		pr_info("KVMPPC-UVMEM: No support for secure guests\n");
+		goto out;
+	}
+
+	res = request_free_mem_region(&iomem_resource, size, "kvmppc_uvmem");
+	if (IS_ERR(res)) {
+		ret = PTR_ERR(res);
+		goto out;
+	}
+
+	kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
+	kvmppc_uvmem_pgmap.range.start = res->start;
+	kvmppc_uvmem_pgmap.range.end = res->end;
+	kvmppc_uvmem_pgmap.nr_range = 1;
+	kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
+	/* just one global instance: */
+	kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
+	addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE);
+	if (IS_ERR(addr)) {
+		ret = PTR_ERR(addr);
+		goto out_free_region;
+	}
+
+	pfn_first = res->start >> PAGE_SHIFT;
+	pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT);
+	kvmppc_uvmem_bitmap = bitmap_zalloc(pfn_last - pfn_first, GFP_KERNEL);
+	if (!kvmppc_uvmem_bitmap) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	pr_info("KVMPPC-UVMEM: Secure Memory size 0x%lx\n", size);
+	return ret;
+out_unmap:
+	memunmap_pages(&kvmppc_uvmem_pgmap);
+out_free_region:
+	release_mem_region(res->start, size);
+out:
+	return ret;
+}
+
+void kvmppc_uvmem_free(void)
+{
+	if (!kvmppc_uvmem_bitmap)
+		return;
+
+	memunmap_pages(&kvmppc_uvmem_pgmap);
+	release_mem_region(kvmppc_uvmem_pgmap.range.start,
+			   range_len(&kvmppc_uvmem_pgmap.range));
+	bitmap_free(kvmppc_uvmem_bitmap);
+}
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
new file mode 100644
index 0000000000..f4bec2fc51
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/asm-compat.h>
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+#define FUNC(name) 		name
+#else
+#define FUNC(name) 		GLUE(.,name)
+#endif
+#define GET_SHADOW_VCPU(reg)    addi	reg, r13, PACA_SVCPU
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+#define FUNC(name)		name
+#define GET_SHADOW_VCPU(reg)	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#define VCPU_LOAD_NVGPRS(vcpu) \
+	PPC_LL	r14, VCPU_GPR(R14)(vcpu); \
+	PPC_LL	r15, VCPU_GPR(R15)(vcpu); \
+	PPC_LL	r16, VCPU_GPR(R16)(vcpu); \
+	PPC_LL	r17, VCPU_GPR(R17)(vcpu); \
+	PPC_LL	r18, VCPU_GPR(R18)(vcpu); \
+	PPC_LL	r19, VCPU_GPR(R19)(vcpu); \
+	PPC_LL	r20, VCPU_GPR(R20)(vcpu); \
+	PPC_LL	r21, VCPU_GPR(R21)(vcpu); \
+	PPC_LL	r22, VCPU_GPR(R22)(vcpu); \
+	PPC_LL	r23, VCPU_GPR(R23)(vcpu); \
+	PPC_LL	r24, VCPU_GPR(R24)(vcpu); \
+	PPC_LL	r25, VCPU_GPR(R25)(vcpu); \
+	PPC_LL	r26, VCPU_GPR(R26)(vcpu); \
+	PPC_LL	r27, VCPU_GPR(R27)(vcpu); \
+	PPC_LL	r28, VCPU_GPR(R28)(vcpu); \
+	PPC_LL	r29, VCPU_GPR(R29)(vcpu); \
+	PPC_LL	r30, VCPU_GPR(R30)(vcpu); \
+	PPC_LL	r31, VCPU_GPR(R31)(vcpu); \
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (highmem)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r3: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+
+kvm_start_entry:
+	/* Write correct stack frame */
+	mflr	r0
+	PPC_STL	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	PPC_STLU r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save r3 (vcpu) */
+	SAVE_GPR(3, r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save CR */
+	mfcr	r14
+	stw	r14, _CCR(r1)
+
+	/* Save LR */
+	PPC_STL	r0, _LINK(r1)
+
+	/* Load non-volatile guest state from the vcpu */
+	VCPU_LOAD_NVGPRS(r3)
+
+kvm_start_lightweight:
+	/* Copy registers into shadow vcpu so we can access them in real mode */
+	bl	FUNC(kvmppc_copy_to_svcpu)
+	nop
+	REST_GPR(3, r1)
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Get the dcbz32 flag */
+	PPC_LL	r0, VCPU_HFLAGS(r3)
+	rldicl	r0, r0, 0, 63		/* r3 &= 1 */
+	stb	r0, HSTATE_RESTORE_HID5(r13)
+
+	/* Load up guest SPRG3 value, since it's user readable */
+	lbz	r4, VCPU_SHAREDBE(r3)
+	cmpwi	r4, 0
+	ld	r5, VCPU_SHARED(r3)
+	beq	sprg3_little_endian
+sprg3_big_endian:
+#ifdef __BIG_ENDIAN__
+	ld	r4, VCPU_SHARED_SPRG3(r5)
+#else
+	addi	r5, r5, VCPU_SHARED_SPRG3
+	ldbrx	r4, 0, r5
+#endif
+	b	after_sprg3_load
+sprg3_little_endian:
+#ifdef __LITTLE_ENDIAN__
+	ld	r4, VCPU_SHARED_SPRG3(r5)
+#else
+	addi	r5, r5, VCPU_SHARED_SPRG3
+	ldbrx	r4, 0, r5
+#endif
+
+after_sprg3_load:
+	mtspr	SPRN_SPRG3, r4
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	PPC_LL	r4, VCPU_SHADOW_MSR(r3)	/* get shadow_msr */
+
+	/* Jump to segment patching handler and into our guest */
+	bl	FUNC(kvmppc_entry_trampoline)
+	nop
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 * SVCPU.*  = guest *
+	 * MSR.EE   = 1
+	 *
+	 */
+
+	PPC_LL	r3, GPR3(r1)		/* vcpu pointer */
+
+	/*
+	 * kvmppc_copy_from_svcpu can clobber volatile registers, save
+	 * the exit handler id to the vcpu and restore it from there later.
+	 */
+	stw	r12, VCPU_TRAP(r3)
+
+	/* Transfer reg values from shadow vcpu back to vcpu struct */
+
+	bl	FUNC(kvmppc_copy_from_svcpu)
+	nop
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Reload kernel SPRG3 value.
+	 * No need to save guest value as usermode can't modify SPRG3.
+	 */
+	ld	r3, PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/* R7 = vcpu */
+	PPC_LL	r7, GPR3(r1)
+
+	PPC_STL	r14, VCPU_GPR(R14)(r7)
+	PPC_STL	r15, VCPU_GPR(R15)(r7)
+	PPC_STL	r16, VCPU_GPR(R16)(r7)
+	PPC_STL	r17, VCPU_GPR(R17)(r7)
+	PPC_STL	r18, VCPU_GPR(R18)(r7)
+	PPC_STL	r19, VCPU_GPR(R19)(r7)
+	PPC_STL	r20, VCPU_GPR(R20)(r7)
+	PPC_STL	r21, VCPU_GPR(R21)(r7)
+	PPC_STL	r22, VCPU_GPR(R22)(r7)
+	PPC_STL	r23, VCPU_GPR(R23)(r7)
+	PPC_STL	r24, VCPU_GPR(R24)(r7)
+	PPC_STL	r25, VCPU_GPR(R25)(r7)
+	PPC_STL	r26, VCPU_GPR(R26)(r7)
+	PPC_STL	r27, VCPU_GPR(R27)(r7)
+	PPC_STL	r28, VCPU_GPR(R28)(r7)
+	PPC_STL	r29, VCPU_GPR(R29)(r7)
+	PPC_STL	r30, VCPU_GPR(R30)(r7)
+	PPC_STL	r31, VCPU_GPR(R31)(r7)
+
+	/* Pass the exit number as 2nd argument to kvmppc_handle_exit */
+	lwz	r4, VCPU_TRAP(r7)
+
+	/* Restore r3 (vcpu) */
+	REST_GPR(3, r1)
+	bl	FUNC(kvmppc_handle_exit_pr)
+
+	/* If RESUME_GUEST, get back in the loop */
+	cmpwi	r3, RESUME_GUEST
+	beq	kvm_loop_lightweight
+
+	cmpwi	r3, RESUME_GUEST_NV
+	beq	kvm_loop_heavyweight
+
+kvm_exit_loop:
+
+	PPC_LL	r4, _LINK(r1)
+	mtlr	r4
+
+	lwz	r14, _CCR(r1)
+	mtcr	r14
+
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	blr
+
+kvm_loop_heavyweight:
+
+	PPC_LL	r4, _LINK(r1)
+	PPC_STL r4, (PPC_LR_STKOFF + SWITCH_FRAME_SIZE)(r1)
+
+	/* Load vcpu */
+	REST_GPR(3, r1)
+
+	/* Load non-volatile guest state from the vcpu */
+	VCPU_LOAD_NVGPRS(r3)
+
+	/* Jump back into the beginning of this function */
+	b	kvm_start_lightweight
+
+kvm_loop_lightweight:
+
+	/* We'll need the vcpu pointer */
+	REST_GPR(3, r1)
+
+	/* Jump back into the beginning of this function */
+	b	kvm_start_lightweight
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 0000000000..ce79ac33e8
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#include "trace_pr.h"
+
+#define PTE_SIZE	12
+
+static struct kmem_cache *hpte_cache;
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr)
+{
+	return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE,
+		       HPTEG_HASH_BITS_PTE_LONG);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+	return hash_64((vpage & 0xffffff000ULL) >> 12,
+		       HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage)
+{
+	return hash_64((vpage & 0xffffffff0ULL) >> 4,
+		       HPTEG_HASH_BITS_VPTE_64K);
+}
+#endif
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 index;
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	trace_kvm_book3s_mmu_map(pte);
+
+	spin_lock(&vcpu3s->mmu_lock);
+
+	/* Add to ePTE list */
+	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
+
+	/* Add to ePTE_long list */
+	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte_long,
+			   &vcpu3s->hpte_hash_pte_long[index]);
+
+	/* Add to vPTE list */
+	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
+
+	/* Add to vPTE_long list */
+	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte_long,
+			   &vcpu3s->hpte_hash_vpte_long[index]);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Add to vPTE_64k list */
+	index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte_64k,
+			   &vcpu3s->hpte_hash_vpte_64k[index]);
+#endif
+
+	vcpu3s->hpte_cache_count++;
+
+	spin_unlock(&vcpu3s->mmu_lock);
+}
+
+static void free_pte_rcu(struct rcu_head *head)
+{
+	struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head);
+	kmem_cache_free(hpte_cache, pte);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	trace_kvm_book3s_mmu_invalidate(pte);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	spin_lock(&vcpu3s->mmu_lock);
+
+	/* pte already invalidated in between? */
+	if (hlist_unhashed(&pte->list_pte)) {
+		spin_unlock(&vcpu3s->mmu_lock);
+		return;
+	}
+
+	hlist_del_init_rcu(&pte->list_pte);
+	hlist_del_init_rcu(&pte->list_pte_long);
+	hlist_del_init_rcu(&pte->list_vpte);
+	hlist_del_init_rcu(&pte->list_vpte_long);
+#ifdef CONFIG_PPC_BOOK3S_64
+	hlist_del_init_rcu(&pte->list_vpte_64k);
+#endif
+	vcpu3s->hpte_cache_count--;
+
+	spin_unlock(&vcpu3s->mmu_lock);
+
+	call_rcu(&pte->rcu_head, free_pte_rcu);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+	int i;
+
+	rcu_read_lock();
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+			invalidate_pte(vcpu, pte);
+	}
+
+	rcu_read_unlock();
+}
+
+static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_pte)
+		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu3s->hpte_hash_pte_long[
+			kvmppc_mmu_hash_pte_long(guest_ea)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_pte_long)
+		if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask);
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
+		break;
+	case 0x0ffff000:
+		kvmppc_mmu_pte_flush_long(vcpu, guest_ea);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Flush with mask 0xffffffff0 */
+static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffffff0ULL;
+
+	list = &vcpu3s->hpte_hash_vpte_64k[
+		kvmppc_mmu_hash_vpte_64k(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte_64k)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+#endif
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffff000ULL;
+
+	list = &vcpu3s->hpte_hash_vpte_long[
+		kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask);
+	guest_vp &= vp_mask;
+
+	switch(vp_mask) {
+	case 0xfffffffffULL:
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case 0xffffffff0ULL:
+		kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp);
+		break;
+#endif
+	case 0xffffff000ULL:
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+	int i;
+
+	trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end);
+
+	rcu_read_lock();
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+			if ((pte->pte.raddr >= pa_start) &&
+			    (pte->pte.raddr < pa_end))
+				invalidate_pte(vcpu, pte);
+	}
+
+	rcu_read_unlock();
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+
+	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+
+	return pte;
+}
+
+void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte)
+{
+	kmem_cache_free(hpte_cache, pte);
+}
+
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
+
+static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		INIT_HLIST_HEAD(&hash_list[i]);
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+#ifdef CONFIG_PPC_BOOK3S_64
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k));
+#endif
+
+	spin_lock_init(&vcpu3s->mmu_lock);
+
+	return 0;
+}
+
+int kvmppc_mmu_hpte_sysinit(void)
+{
+	/* init hpte slab cache */
+	hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
+				       sizeof(struct hpte_cache), 0, NULL);
+
+	return 0;
+}
+
+void kvmppc_mmu_hpte_sysexit(void)
+{
+	kmem_cache_destroy(hpte_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
new file mode 100644
index 0000000000..bc39c76c9d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -0,0 +1,1263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright Novell Inc 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_fpu.h>
+#include <asm/reg.h>
+#include <asm/cacheflush.h>
+#include <asm/switch_to.h>
+#include <linux/vmalloc.h>
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+#define dprintk printk
+#else
+#define dprintk(...) do { } while(0);
+#endif
+
+#define OP_LFS			48
+#define OP_LFSU			49
+#define OP_LFD			50
+#define OP_LFDU			51
+#define OP_STFS			52
+#define OP_STFSU		53
+#define OP_STFD			54
+#define OP_STFDU		55
+#define OP_PSQ_L		56
+#define OP_PSQ_LU		57
+#define OP_PSQ_ST		60
+#define OP_PSQ_STU		61
+
+#define OP_31_LFSX		535
+#define OP_31_LFSUX		567
+#define OP_31_LFDX		599
+#define OP_31_LFDUX		631
+#define OP_31_STFSX		663
+#define OP_31_STFSUX		695
+#define OP_31_STFX		727
+#define OP_31_STFUX		759
+#define OP_31_LWIZX		887
+#define OP_31_STFIWX		983
+
+#define OP_59_FADDS		21
+#define OP_59_FSUBS		20
+#define OP_59_FSQRTS		22
+#define OP_59_FDIVS		18
+#define OP_59_FRES		24
+#define OP_59_FMULS		25
+#define OP_59_FRSQRTES		26
+#define OP_59_FMSUBS		28
+#define OP_59_FMADDS		29
+#define OP_59_FNMSUBS		30
+#define OP_59_FNMADDS		31
+
+#define OP_63_FCMPU		0
+#define OP_63_FCPSGN		8
+#define OP_63_FRSP		12
+#define OP_63_FCTIW		14
+#define OP_63_FCTIWZ		15
+#define OP_63_FDIV		18
+#define OP_63_FADD		21
+#define OP_63_FSQRT		22
+#define OP_63_FSEL		23
+#define OP_63_FRE		24
+#define OP_63_FMUL		25
+#define OP_63_FRSQRTE		26
+#define OP_63_FMSUB		28
+#define OP_63_FMADD		29
+#define OP_63_FNMSUB		30
+#define OP_63_FNMADD		31
+#define OP_63_FCMPO		32
+#define OP_63_MTFSB1		38 // XXX
+#define OP_63_FSUB		20
+#define OP_63_FNEG		40
+#define OP_63_MCRFS		64
+#define OP_63_MTFSB0		70
+#define OP_63_FMR		72
+#define OP_63_MTFSFI		134
+#define OP_63_FABS		264
+#define OP_63_MFFS		583
+#define OP_63_MTFSF		711
+
+#define OP_4X_PS_CMPU0		0
+#define OP_4X_PSQ_LX		6
+#define OP_4XW_PSQ_STX		7
+#define OP_4A_PS_SUM0		10
+#define OP_4A_PS_SUM1		11
+#define OP_4A_PS_MULS0		12
+#define OP_4A_PS_MULS1		13
+#define OP_4A_PS_MADDS0		14
+#define OP_4A_PS_MADDS1		15
+#define OP_4A_PS_DIV		18
+#define OP_4A_PS_SUB		20
+#define OP_4A_PS_ADD		21
+#define OP_4A_PS_SEL		23
+#define OP_4A_PS_RES		24
+#define OP_4A_PS_MUL		25
+#define OP_4A_PS_RSQRTE		26
+#define OP_4A_PS_MSUB		28
+#define OP_4A_PS_MADD		29
+#define OP_4A_PS_NMSUB		30
+#define OP_4A_PS_NMADD		31
+#define OP_4X_PS_CMPO0		32
+#define OP_4X_PSQ_LUX		38
+#define OP_4XW_PSQ_STUX		39
+#define OP_4X_PS_NEG		40
+#define OP_4X_PS_CMPU1		64
+#define OP_4X_PS_MR		72
+#define OP_4X_PS_CMPO1		96
+#define OP_4X_PS_NABS		136
+#define OP_4X_PS_ABS		264
+#define OP_4X_PS_MERGE00	528
+#define OP_4X_PS_MERGE01	560
+#define OP_4X_PS_MERGE10	592
+#define OP_4X_PS_MERGE11	624
+
+#define SCALAR_NONE		0
+#define SCALAR_HIGH		(1 << 0)
+#define SCALAR_LOW		(1 << 1)
+#define SCALAR_NO_PS0		(1 << 2)
+#define SCALAR_NO_PS1		(1 << 3)
+
+#define GQR_ST_TYPE_MASK	0x00000007
+#define GQR_ST_TYPE_SHIFT	0
+#define GQR_ST_SCALE_MASK	0x00003f00
+#define GQR_ST_SCALE_SHIFT	8
+#define GQR_LD_TYPE_MASK	0x00070000
+#define GQR_LD_TYPE_SHIFT	16
+#define GQR_LD_SCALE_MASK	0x3f000000
+#define GQR_LD_SCALE_SHIFT	24
+
+#define GQR_QUANTIZE_FLOAT	0
+#define GQR_QUANTIZE_U8		4
+#define GQR_QUANTIZE_U16	5
+#define GQR_QUANTIZE_S8		6
+#define GQR_QUANTIZE_S16	7
+
+#define FPU_LS_SINGLE		0
+#define FPU_LS_DOUBLE		1
+#define FPU_LS_SINGLE_LOW	2
+
+static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
+{
+	kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]);
+}
+
+static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
+{
+	u32 dsisr;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	msr = kvmppc_set_field(msr, 33, 36, 0);
+	msr = kvmppc_set_field(msr, 42, 47, 0);
+	kvmppc_set_msr(vcpu, msr);
+	kvmppc_set_dar(vcpu, eaddr);
+	/* Page Fault */
+	dsisr = kvmppc_set_field(0, 33, 33, 1);
+	if (is_store)
+		dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+	kvmppc_set_dsisr(vcpu, dsisr);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
+}
+
+static int kvmppc_emulate_fpr_load(struct kvm_vcpu *vcpu,
+				   int rs, ulong addr, int ls_type)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	char tmp[8];
+	int len = sizeof(u32);
+
+	if (ls_type == FPU_LS_DOUBLE)
+		len = sizeof(u64);
+
+	/* read from memory */
+	r = kvmppc_ld(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, false);
+		goto done_load;
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_load(vcpu, KVM_MMIO_REG_FPR | rs,
+					      len, 1);
+		goto done_load;
+	}
+
+	emulated = EMULATE_DONE;
+
+	/* put in registers */
+	switch (ls_type) {
+	case FPU_LS_SINGLE:
+		kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs));
+		vcpu->arch.qpr[rs] = *((u32*)tmp);
+		break;
+	case FPU_LS_DOUBLE:
+		VCPU_FPR(vcpu, rs) = *((u64*)tmp);
+		break;
+	}
+
+	dprintk(KERN_INFO "KVM: FPR_LD [0x%llx] at 0x%lx (%d)\n", *(u64*)tmp,
+			  addr, len);
+
+done_load:
+	return emulated;
+}
+
+static int kvmppc_emulate_fpr_store(struct kvm_vcpu *vcpu,
+				    int rs, ulong addr, int ls_type)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	char tmp[8];
+	u64 val;
+	int len;
+
+	switch (ls_type) {
+	case FPU_LS_SINGLE:
+		kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp);
+		val = *((u32*)tmp);
+		len = sizeof(u32);
+		break;
+	case FPU_LS_SINGLE_LOW:
+		*((u32*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs) & 0xffffffff;
+		len = sizeof(u32);
+		break;
+	case FPU_LS_DOUBLE:
+		*((u64*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs);
+		len = sizeof(u64);
+		break;
+	default:
+		val = 0;
+		len = 0;
+	}
+
+	r = kvmppc_st(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, true);
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_store(vcpu, val, len, 1);
+	} else {
+		emulated = EMULATE_DONE;
+	}
+
+	dprintk(KERN_INFO "KVM: FPR_ST [0x%llx] at 0x%lx (%d)\n",
+			  val, addr, len);
+
+	return emulated;
+}
+
+static int kvmppc_emulate_psq_load(struct kvm_vcpu *vcpu,
+				   int rs, ulong addr, bool w, int i)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	float one = 1.0;
+	u32 tmp[2];
+
+	/* read from memory */
+	if (w) {
+		r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
+		memcpy(&tmp[1], &one, sizeof(u32));
+	} else {
+		r = kvmppc_ld(vcpu, &addr, sizeof(u32) * 2, tmp, true);
+	}
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, false);
+		goto done_load;
+	} else if ((r == EMULATE_DO_MMIO) && w) {
+		emulated = kvmppc_handle_load(vcpu, KVM_MMIO_REG_FPR | rs,
+					      4, 1);
+		vcpu->arch.qpr[rs] = tmp[1];
+		goto done_load;
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_load(vcpu, KVM_MMIO_REG_FQPR | rs,
+					      8, 1);
+		goto done_load;
+	}
+
+	emulated = EMULATE_DONE;
+
+	/* put in registers */
+	kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs));
+	vcpu->arch.qpr[rs] = tmp[1];
+
+	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
+			  tmp[1], addr, w ? 4 : 8);
+
+done_load:
+	return emulated;
+}
+
+static int kvmppc_emulate_psq_store(struct kvm_vcpu *vcpu,
+				    int rs, ulong addr, bool w, int i)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	u32 tmp[2];
+	int len = w ? sizeof(u32) : sizeof(u64);
+
+	kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]);
+	tmp[1] = vcpu->arch.qpr[rs];
+
+	r = kvmppc_st(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, true);
+	} else if ((r == EMULATE_DO_MMIO) && w) {
+		emulated = kvmppc_handle_store(vcpu, tmp[0], 4, 1);
+	} else if (r == EMULATE_DO_MMIO) {
+		u64 val = ((u64)tmp[0] << 32) | tmp[1];
+		emulated = kvmppc_handle_store(vcpu, val, 8, 1);
+	} else {
+		emulated = EMULATE_DONE;
+	}
+
+	dprintk(KERN_INFO "KVM: PSQ_ST [0x%x, 0x%x] at 0x%lx (%d)\n",
+			  tmp[0], tmp[1], addr, len);
+
+	return emulated;
+}
+
+/*
+ * Cuts out inst bits with ordering according to spec.
+ * That means the leftmost bit is zero. All given bits are included.
+ */
+static inline u32 inst_get_field(u32 inst, int msb, int lsb)
+{
+	return kvmppc_get_field(inst, msb + 32, lsb + 32);
+}
+
+static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+{
+	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+		return false;
+
+	switch (get_op(inst)) {
+	case OP_PSQ_L:
+	case OP_PSQ_LU:
+	case OP_PSQ_ST:
+	case OP_PSQ_STU:
+	case OP_LFS:
+	case OP_LFSU:
+	case OP_LFD:
+	case OP_LFDU:
+	case OP_STFS:
+	case OP_STFSU:
+	case OP_STFD:
+	case OP_STFDU:
+		return true;
+	case 4:
+		/* X form */
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_4X_PS_CMPU0:
+		case OP_4X_PSQ_LX:
+		case OP_4X_PS_CMPO0:
+		case OP_4X_PSQ_LUX:
+		case OP_4X_PS_NEG:
+		case OP_4X_PS_CMPU1:
+		case OP_4X_PS_MR:
+		case OP_4X_PS_CMPO1:
+		case OP_4X_PS_NABS:
+		case OP_4X_PS_ABS:
+		case OP_4X_PS_MERGE00:
+		case OP_4X_PS_MERGE01:
+		case OP_4X_PS_MERGE10:
+		case OP_4X_PS_MERGE11:
+			return true;
+		}
+		/* XW form */
+		switch (inst_get_field(inst, 25, 30)) {
+		case OP_4XW_PSQ_STX:
+		case OP_4XW_PSQ_STUX:
+			return true;
+		}
+		/* A form */
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_4A_PS_SUM1:
+		case OP_4A_PS_SUM0:
+		case OP_4A_PS_MULS0:
+		case OP_4A_PS_MULS1:
+		case OP_4A_PS_MADDS0:
+		case OP_4A_PS_MADDS1:
+		case OP_4A_PS_DIV:
+		case OP_4A_PS_SUB:
+		case OP_4A_PS_ADD:
+		case OP_4A_PS_SEL:
+		case OP_4A_PS_RES:
+		case OP_4A_PS_MUL:
+		case OP_4A_PS_RSQRTE:
+		case OP_4A_PS_MSUB:
+		case OP_4A_PS_MADD:
+		case OP_4A_PS_NMSUB:
+		case OP_4A_PS_NMADD:
+			return true;
+		}
+		break;
+	case 59:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_59_FADDS:
+		case OP_59_FSUBS:
+		case OP_59_FDIVS:
+		case OP_59_FRES:
+		case OP_59_FRSQRTES:
+			return true;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_59_FMULS:
+		case OP_59_FMSUBS:
+		case OP_59_FMADDS:
+		case OP_59_FNMSUBS:
+		case OP_59_FNMADDS:
+			return true;
+		}
+		break;
+	case 63:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_63_MTFSB0:
+		case OP_63_MTFSB1:
+		case OP_63_MTFSF:
+		case OP_63_MTFSFI:
+		case OP_63_MCRFS:
+		case OP_63_MFFS:
+		case OP_63_FCMPU:
+		case OP_63_FCMPO:
+		case OP_63_FNEG:
+		case OP_63_FMR:
+		case OP_63_FABS:
+		case OP_63_FRSP:
+		case OP_63_FDIV:
+		case OP_63_FADD:
+		case OP_63_FSUB:
+		case OP_63_FCTIW:
+		case OP_63_FCTIWZ:
+		case OP_63_FRSQRTE:
+		case OP_63_FCPSGN:
+			return true;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_63_FMUL:
+		case OP_63_FSEL:
+		case OP_63_FMSUB:
+		case OP_63_FMADD:
+		case OP_63_FNMSUB:
+		case OP_63_FNMADD:
+			return true;
+		}
+		break;
+	case 31:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_31_LFSX:
+		case OP_31_LFSUX:
+		case OP_31_LFDX:
+		case OP_31_LFDUX:
+		case OP_31_STFSX:
+		case OP_31_STFSUX:
+		case OP_31_STFX:
+		case OP_31_STFUX:
+		case OP_31_STFIWX:
+			return true;
+		}
+		break;
+	}
+
+	return false;
+}
+
+static int get_d_signext(u32 inst)
+{
+	int d = inst & 0x8ff;
+
+	if (d & 0x800)
+		return -(d & 0x7ff);
+
+	return (d & 0x7ff);
+}
+
+static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
+				      int reg_out, int reg_in1, int reg_in2,
+				      int reg_in3, int scalar,
+				      void (*func)(u64 *fpscr,
+						 u32 *dst, u32 *src1,
+						 u32 *src2, u32 *src3))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out;
+	u32 ps0_in1, ps0_in2, ps0_in3;
+	u32 ps1_in1, ps1_in2, ps1_in3;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3);
+
+	if (scalar & SCALAR_LOW)
+		ps0_in2 = qpr[reg_in2];
+
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+
+	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
+
+	if (!(scalar & SCALAR_NO_PS0))
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+
+	/* PS1 */
+	ps1_in1 = qpr[reg_in1];
+	ps1_in2 = qpr[reg_in2];
+	ps1_in3 = qpr[reg_in3];
+
+	if (scalar & SCALAR_HIGH)
+		ps1_in2 = ps0_in2;
+
+	if (!(scalar & SCALAR_NO_PS1))
+		func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+
+	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
+				    int reg_out, int reg_in1, int reg_in2,
+				    int scalar,
+				    void (*func)(u64 *fpscr,
+						 u32 *dst, u32 *src1,
+						 u32 *src2))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out;
+	u32 ps0_in1, ps0_in2;
+	u32 ps1_out;
+	u32 ps1_in1, ps1_in2;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
+
+	if (scalar & SCALAR_LOW)
+		ps0_in2 = qpr[reg_in2];
+	else
+		kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
+
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
+
+	if (!(scalar & SCALAR_NO_PS0)) {
+		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
+				  ps0_in1, ps0_in2, ps0_out);
+
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+	}
+
+	/* PS1 */
+	ps1_in1 = qpr[reg_in1];
+	ps1_in2 = qpr[reg_in2];
+
+	if (scalar & SCALAR_HIGH)
+		ps1_in2 = ps0_in2;
+
+	func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
+
+	if (!(scalar & SCALAR_NO_PS1)) {
+		qpr[reg_out] = ps1_out;
+
+		dprintk(KERN_INFO "PS2 ps1 -> f(0x%x, 0x%x) = 0x%x\n",
+				  ps1_in1, ps1_in2, qpr[reg_out]);
+	}
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
+				    int reg_out, int reg_in,
+				    void (*func)(u64 *t,
+						 u32 *dst, u32 *src1))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out, ps0_in;
+	u32 ps1_in;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in);
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in);
+
+	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
+			  ps0_in, ps0_out);
+
+	kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+
+	/* PS1 */
+	ps1_in = qpr[reg_in];
+	func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in);
+
+	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
+			  ps1_in, qpr[reg_out]);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emulate_paired_single(struct kvm_vcpu *vcpu)
+{
+	u32 inst;
+	ppc_inst_t pinst;
+	enum emulation_result emulated = EMULATE_DONE;
+	int ax_rd, ax_ra, ax_rb, ax_rc;
+	short full_d;
+	u64 *fpr_d, *fpr_a, *fpr_b, *fpr_c;
+
+	bool rcomp;
+	u32 cr;
+#ifdef DEBUG
+	int i;
+#endif
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
+	inst = ppc_inst_val(pinst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	ax_rd = inst_get_field(inst, 6, 10);
+	ax_ra = inst_get_field(inst, 11, 15);
+	ax_rb = inst_get_field(inst, 16, 20);
+	ax_rc = inst_get_field(inst, 21, 25);
+	full_d = inst_get_field(inst, 16, 31);
+
+	fpr_d = &VCPU_FPR(vcpu, ax_rd);
+	fpr_a = &VCPU_FPR(vcpu, ax_ra);
+	fpr_b = &VCPU_FPR(vcpu, ax_rb);
+	fpr_c = &VCPU_FPR(vcpu, ax_rc);
+
+	rcomp = (inst & 1) ? true : false;
+	cr = kvmppc_get_cr(vcpu);
+
+	if (!kvmppc_inst_is_paired_single(vcpu, inst))
+		return EMULATE_FAIL;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {
+		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
+		return EMULATE_AGAIN;
+	}
+
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	preempt_disable();
+	enable_kernel_fp();
+	/* Do we need to clear FE0 / FE1 here? Don't think so. */
+
+#ifdef DEBUG
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
+		u32 f;
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
+		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
+			i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]);
+	}
+#endif
+
+	switch (get_op(inst)) {
+	case OP_PSQ_L:
+	{
+		ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_load(vcpu, ax_rd, addr, w, i);
+		break;
+	}
+	case OP_PSQ_LU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_load(vcpu, ax_rd, addr, w, i);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_PSQ_ST:
+	{
+		ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_store(vcpu, ax_rd, addr, w, i);
+		break;
+	}
+	case OP_PSQ_STU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_store(vcpu, ax_rd, addr, w, i);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case 4:
+		/* X form */
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_4X_PS_CMPU0:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PSQ_LX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_load(vcpu, ax_rd, addr, w, i);
+			break;
+		}
+		case OP_4X_PS_CMPO0:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PSQ_LUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_load(vcpu, ax_rd, addr, w, i);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_4X_PS_NEG:
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] ^= 0x80000000;
+			break;
+		case OP_4X_PS_CMPU1:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PS_MR:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		case OP_4X_PS_CMPO1:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PS_NABS:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] |= 0x80000000;
+			break;
+		case OP_4X_PS_ABS:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] &= ~0x80000000;
+			break;
+		case OP_4X_PS_MERGE00:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
+				   &vcpu->arch.qpr[ax_rd]);
+			break;
+		case OP_4X_PS_MERGE01:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		case OP_4X_PS_MERGE10:
+			WARN_ON(rcomp);
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &VCPU_FPR(vcpu, ax_rd));
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
+				   &vcpu->arch.qpr[ax_rd]);
+			break;
+		case OP_4X_PS_MERGE11:
+			WARN_ON(rcomp);
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &VCPU_FPR(vcpu, ax_rd));
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		}
+		/* XW form */
+		switch (inst_get_field(inst, 25, 30)) {
+		case OP_4XW_PSQ_STX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_store(vcpu, ax_rd, addr, w, i);
+			break;
+		}
+		case OP_4XW_PSQ_STUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_store(vcpu, ax_rd, addr, w, i);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		}
+		/* A form */
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_4A_PS_SUM1:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc);
+			break;
+		case OP_4A_PS_SUM0:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NO_PS1 | SCALAR_LOW, fps_fadds);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rc];
+			break;
+		case OP_4A_PS_MULS0:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_HIGH, fps_fmuls);
+			break;
+		case OP_4A_PS_MULS1:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_LOW, fps_fmuls);
+			break;
+		case OP_4A_PS_MADDS0:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_HIGH, fps_fmadds);
+			break;
+		case OP_4A_PS_MADDS1:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_LOW, fps_fmadds);
+			break;
+		case OP_4A_PS_DIV:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fdivs);
+			break;
+		case OP_4A_PS_SUB:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fsubs);
+			break;
+		case OP_4A_PS_ADD:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fadds);
+			break;
+		case OP_4A_PS_SEL:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fsel);
+			break;
+		case OP_4A_PS_RES:
+			emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+					ax_rb, fps_fres);
+			break;
+		case OP_4A_PS_MUL:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_NONE, fps_fmuls);
+			break;
+		case OP_4A_PS_RSQRTE:
+			emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+					ax_rb, fps_frsqrte);
+			break;
+		case OP_4A_PS_MSUB:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmsubs);
+			break;
+		case OP_4A_PS_MADD:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmadds);
+			break;
+		case OP_4A_PS_NMSUB:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmsubs);
+			break;
+		case OP_4A_PS_NMADD:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmadds);
+			break;
+		}
+		break;
+
+	/* Real FPU operations */
+
+	case OP_LFS:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd, addr,
+						   FPU_LS_SINGLE);
+		break;
+	}
+	case OP_LFSU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd, addr,
+						   FPU_LS_SINGLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_LFD:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd, addr,
+						   FPU_LS_DOUBLE);
+		break;
+	}
+	case OP_LFDU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd, addr,
+						   FPU_LS_DOUBLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_STFS:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd, addr,
+						    FPU_LS_SINGLE);
+		break;
+	}
+	case OP_STFSU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd, addr,
+						    FPU_LS_SINGLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_STFD:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd, addr,
+						    FPU_LS_DOUBLE);
+		break;
+	}
+	case OP_STFDU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd, addr,
+						    FPU_LS_DOUBLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case 31:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_31_LFSX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd,
+							   addr, FPU_LS_SINGLE);
+			break;
+		}
+		case OP_31_LFSUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd,
+							   addr, FPU_LS_SINGLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_LFDX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd,
+							   addr, FPU_LS_DOUBLE);
+			break;
+		}
+		case OP_31_LFDUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(vcpu, ax_rd,
+							   addr, FPU_LS_DOUBLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFSX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd,
+							    addr, FPU_LS_SINGLE);
+			break;
+		}
+		case OP_31_STFSUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd,
+							    addr, FPU_LS_SINGLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd,
+							    addr, FPU_LS_DOUBLE);
+			break;
+		}
+		case OP_31_STFUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd,
+							    addr, FPU_LS_DOUBLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFIWX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(vcpu, ax_rd,
+							    addr,
+							    FPU_LS_SINGLE_LOW);
+			break;
+		}
+			break;
+		}
+		break;
+	case 59:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_59_FADDS:
+			fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FSUBS:
+			fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FDIVS:
+			fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FRES:
+			fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FRSQRTES:
+			fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_59_FMULS:
+			fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FMSUBS:
+			fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FMADDS:
+			fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FNMSUBS:
+			fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FNMADDS:
+			fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		}
+		break;
+	case 63:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_63_MTFSB0:
+		case OP_63_MTFSB1:
+		case OP_63_MCRFS:
+		case OP_63_MTFSFI:
+			/* XXX need to implement */
+			break;
+		case OP_63_MFFS:
+			/* XXX missing CR */
+			*fpr_d = vcpu->arch.fp.fpscr;
+			break;
+		case OP_63_MTFSF:
+			/* XXX missing fm bits */
+			/* XXX missing CR */
+			vcpu->arch.fp.fpscr = *fpr_b;
+			break;
+		case OP_63_FCMPU:
+		{
+			u32 tmp_cr;
+			u32 cr0_mask = 0xf0000000;
+			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+			fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
+			cr &= ~(cr0_mask >> cr_shift);
+			cr |= (cr & cr0_mask) >> cr_shift;
+			break;
+		}
+		case OP_63_FCMPO:
+		{
+			u32 tmp_cr;
+			u32 cr0_mask = 0xf0000000;
+			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+			fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
+			cr &= ~(cr0_mask >> cr_shift);
+			cr |= (cr & cr0_mask) >> cr_shift;
+			break;
+		}
+		case OP_63_FNEG:
+			fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FMR:
+			*fpr_d = *fpr_b;
+			break;
+		case OP_63_FABS:
+			fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FCPSGN:
+			fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FDIV:
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FADD:
+			fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FSUB:
+			fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FCTIW:
+			fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FCTIWZ:
+			fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FRSP:
+			fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_63_FRSQRTE:
+		{
+			double one = 1.0f;
+
+			/* fD = sqrt(fB) */
+			fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			/* fD = 1.0f / fD */
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);
+			break;
+		}
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_63_FMUL:
+			fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			break;
+		case OP_63_FSEL:
+			fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FMSUB:
+			fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FMADD:
+			fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FNMSUB:
+			fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FNMADD:
+			fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		}
+		break;
+	}
+
+#ifdef DEBUG
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
+		u32 f;
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
+		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
+	}
+#endif
+
+	if (rcomp)
+		kvmppc_set_cr(vcpu, cr);
+
+	disable_kernel_fp();
+	preempt_enable();
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644
index 0000000000..9118242063
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -0,0 +1,2123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ * Functions relating to running KVM on Book 3S processors where
+ * we don't have access to hypervisor mode, and we run the guest
+ * in problem state (user mode).
+ *
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <linux/uaccess.h>
+#include <asm/interrupt.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/switch_to.h>
+#include <asm/firmware.h>
+#include <asm/setup.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <asm/asm-prototypes.h>
+#include <asm/tm.h>
+
+#include "book3s.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_pr.h"
+
+/* #define EXIT_DEBUG */
+/* #define DEBUG_EXT */
+
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr);
+#ifdef CONFIG_PPC_BOOK3S_64
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac);
+#endif
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#define HPTE_R_M   _PAGE_COHERENT
+#endif
+
+static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	return (msr & (MSR_IR|MSR_DR)) == MSR_DR;
+}
+
+static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* We are in DR only split real mode */
+	if ((msr & (MSR_IR|MSR_DR)) != MSR_DR)
+		return;
+
+	/* We have not fixed up the guest already */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK)
+		return;
+
+	/* The code is in fixupable address space */
+	if (pc & SPLIT_HACK_MASK)
+		return;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SPLIT_HACK;
+	kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS);
+}
+
+static void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
+		ulong pc = kvmppc_get_pc(vcpu);
+		ulong lr = kvmppc_get_lr(vcpu);
+		if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+			kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+		if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+			kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
+		vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
+	}
+}
+
+static void kvmppc_inject_interrupt_pr(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+	unsigned long msr, pc, new_msr, new_pc;
+
+	kvmppc_unfixup_split_real(vcpu);
+
+	msr = kvmppc_get_msr(vcpu);
+	pc = kvmppc_get_pc(vcpu);
+	new_msr = vcpu->arch.intr_msr;
+	new_pc = to_book3s(vcpu)->hior + vec;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* If transactional, change to suspend mode on IRQ delivery */
+	if (MSR_TM_TRANSACTIONAL(msr))
+		new_msr |= MSR_TS_S;
+	else
+		new_msr |= msr & MSR_TS_MASK;
+#endif
+
+	kvmppc_set_srr0(vcpu, pc);
+	kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags);
+	kvmppc_set_pc(vcpu, new_pc);
+	kvmppc_set_msr(vcpu, new_msr);
+}
+
+static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
+	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
+	svcpu->in_use = 0;
+	svcpu_put(svcpu);
+
+	/* Disable AIL if supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL);
+		if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV))
+			mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) & ~FSCR_SCV);
+	}
+#endif
+
+	vcpu->cpu = smp_processor_id();
+#ifdef CONFIG_PPC_BOOK3S_32
+	current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu;
+#endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
+
+	kvmppc_restore_tm_pr(vcpu);
+}
+
+static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	if (svcpu->in_use) {
+		kvmppc_copy_from_svcpu(vcpu);
+	}
+	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
+	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
+	svcpu_put(svcpu);
+
+	/* Enable AIL if supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3);
+		if (cpu_has_feature(CPU_FTR_ARCH_300) && (current->thread.fscr & FSCR_SCV))
+			mtspr(SPRN_FSCR, mfspr(SPRN_FSCR) | FSCR_SCV);
+	}
+#endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_unfixup_split_real(vcpu);
+
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+	kvmppc_save_tm_pr(vcpu);
+
+	vcpu->cpu = -1;
+}
+
+/* Copy data needed by real-mode code from vcpu to shadow vcpu */
+void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+
+	svcpu->gpr[0] = vcpu->arch.regs.gpr[0];
+	svcpu->gpr[1] = vcpu->arch.regs.gpr[1];
+	svcpu->gpr[2] = vcpu->arch.regs.gpr[2];
+	svcpu->gpr[3] = vcpu->arch.regs.gpr[3];
+	svcpu->gpr[4] = vcpu->arch.regs.gpr[4];
+	svcpu->gpr[5] = vcpu->arch.regs.gpr[5];
+	svcpu->gpr[6] = vcpu->arch.regs.gpr[6];
+	svcpu->gpr[7] = vcpu->arch.regs.gpr[7];
+	svcpu->gpr[8] = vcpu->arch.regs.gpr[8];
+	svcpu->gpr[9] = vcpu->arch.regs.gpr[9];
+	svcpu->gpr[10] = vcpu->arch.regs.gpr[10];
+	svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
+	svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
+	svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
+	svcpu->cr  = vcpu->arch.regs.ccr;
+	svcpu->xer = vcpu->arch.regs.xer;
+	svcpu->ctr = vcpu->arch.regs.ctr;
+	svcpu->lr  = vcpu->arch.regs.link;
+	svcpu->pc  = vcpu->arch.regs.nip;
+#ifdef CONFIG_PPC_BOOK3S_64
+	svcpu->shadow_fscr = vcpu->arch.shadow_fscr;
+#endif
+	/*
+	 * Now also save the current time base value. We use this
+	 * to find the guest purr and spurr value.
+	 */
+	vcpu->arch.entry_tb = get_tb();
+	vcpu->arch.entry_vtb = get_vtb();
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		vcpu->arch.entry_ic = mfspr(SPRN_IC);
+	svcpu->in_use = true;
+
+	svcpu_put(svcpu);
+}
+
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+	ulong guest_msr = kvmppc_get_msr(vcpu);
+	ulong smsr = guest_msr;
+
+	/* Guest MSR values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
+		MSR_TM | MSR_TS_MASK;
+#else
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+#endif
+	/* Process MSR values */
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+	/* External providers the guest reserved */
+	smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
+	/* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+	smsr |= MSR_HV;
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * in guest privileged state, we want to fail all TM transactions.
+	 * So disable MSR TM bit so that all tbegin. will be able to be
+	 * trapped into host.
+	 */
+	if (!(guest_msr & MSR_PR))
+		smsr &= ~MSR_TM;
+#endif
+	vcpu->arch.shadow_msr = smsr;
+}
+
+/* Copy data touched by real-mode code from shadow vcpu back to vcpu */
+void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	ulong old_msr;
+#endif
+
+	/*
+	 * Maybe we were already preempted and synced the svcpu from
+	 * our preempt notifiers. Don't bother touching this svcpu then.
+	 */
+	if (!svcpu->in_use)
+		goto out;
+
+	vcpu->arch.regs.gpr[0] = svcpu->gpr[0];
+	vcpu->arch.regs.gpr[1] = svcpu->gpr[1];
+	vcpu->arch.regs.gpr[2] = svcpu->gpr[2];
+	vcpu->arch.regs.gpr[3] = svcpu->gpr[3];
+	vcpu->arch.regs.gpr[4] = svcpu->gpr[4];
+	vcpu->arch.regs.gpr[5] = svcpu->gpr[5];
+	vcpu->arch.regs.gpr[6] = svcpu->gpr[6];
+	vcpu->arch.regs.gpr[7] = svcpu->gpr[7];
+	vcpu->arch.regs.gpr[8] = svcpu->gpr[8];
+	vcpu->arch.regs.gpr[9] = svcpu->gpr[9];
+	vcpu->arch.regs.gpr[10] = svcpu->gpr[10];
+	vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
+	vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
+	vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
+	vcpu->arch.regs.ccr  = svcpu->cr;
+	vcpu->arch.regs.xer = svcpu->xer;
+	vcpu->arch.regs.ctr = svcpu->ctr;
+	vcpu->arch.regs.link  = svcpu->lr;
+	vcpu->arch.regs.nip  = svcpu->pc;
+	vcpu->arch.shadow_srr1 = svcpu->shadow_srr1;
+	vcpu->arch.fault_dar   = svcpu->fault_dar;
+	vcpu->arch.fault_dsisr = svcpu->fault_dsisr;
+	vcpu->arch.last_inst   = svcpu->last_inst;
+#ifdef CONFIG_PPC_BOOK3S_64
+	vcpu->arch.shadow_fscr = svcpu->shadow_fscr;
+#endif
+	/*
+	 * Update purr and spurr using time base on exit.
+	 */
+	vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
+	vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
+	to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * Unlike other MSR bits, MSR[TS]bits can be changed at guest without
+	 * notifying host:
+	 *  modified by unprivileged instructions like "tbegin"/"tend"/
+	 * "tresume"/"tsuspend" in PR KVM guest.
+	 *
+	 * It is necessary to sync here to calculate a correct shadow_msr.
+	 *
+	 * privileged guest's tbegin will be failed at present. So we
+	 * only take care of problem state guest.
+	 */
+	old_msr = kvmppc_get_msr(vcpu);
+	if (unlikely((old_msr & MSR_PR) &&
+		(vcpu->arch.shadow_srr1 & (MSR_TS_MASK)) !=
+				(old_msr & (MSR_TS_MASK)))) {
+		old_msr &= ~(MSR_TS_MASK);
+		old_msr |= (vcpu->arch.shadow_srr1 & (MSR_TS_MASK));
+		kvmppc_set_msr_fast(vcpu, old_msr);
+		kvmppc_recalc_shadow_msr(vcpu);
+	}
+#endif
+
+	svcpu->in_use = false;
+
+out:
+	svcpu_put(svcpu);
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
+{
+	tm_enable();
+	vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
+	vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+	vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
+	tm_disable();
+}
+
+void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu)
+{
+	tm_enable();
+	mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
+	mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+	mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+	tm_disable();
+}
+
+/* loadup math bits which is enabled at kvmppc_get_msr() but not enabled at
+ * hardware.
+ */
+static void kvmppc_handle_lost_math_exts(struct kvm_vcpu *vcpu)
+{
+	ulong exit_nr;
+	ulong ext_diff = (kvmppc_get_msr(vcpu) & ~vcpu->arch.guest_owned_ext) &
+		(MSR_FP | MSR_VEC | MSR_VSX);
+
+	if (!ext_diff)
+		return;
+
+	if (ext_diff == MSR_FP)
+		exit_nr = BOOK3S_INTERRUPT_FP_UNAVAIL;
+	else if (ext_diff == MSR_VEC)
+		exit_nr = BOOK3S_INTERRUPT_ALTIVEC;
+	else
+		exit_nr = BOOK3S_INTERRUPT_VSX;
+
+	kvmppc_handle_ext(vcpu, exit_nr, ext_diff);
+}
+
+void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
+{
+	if (!(MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)))) {
+		kvmppc_save_tm_sprs(vcpu);
+		return;
+	}
+
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+	preempt_disable();
+	_kvmppc_save_tm_pr(vcpu, mfmsr());
+	preempt_enable();
+}
+
+void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
+{
+	if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
+		kvmppc_restore_tm_sprs(vcpu);
+		if (kvmppc_get_msr(vcpu) & MSR_TM) {
+			kvmppc_handle_lost_math_exts(vcpu);
+			if (vcpu->arch.fscr & FSCR_TAR)
+				kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+		}
+		return;
+	}
+
+	preempt_disable();
+	_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
+	preempt_enable();
+
+	if (kvmppc_get_msr(vcpu) & MSR_TM) {
+		kvmppc_handle_lost_math_exts(vcpu);
+		if (vcpu->arch.fscr & FSCR_TAR)
+			kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+	}
+}
+#endif
+
+static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
+{
+	int r = 1; /* Indicate we want to get back into the guest */
+
+	/* We misuse TLB_FLUSH to indicate that we want to clear
+	   all shadow cache entries */
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return r;
+}
+
+/************* MMU Notifiers *************/
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	unsigned long i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+				      range->end << PAGE_SHIFT);
+
+	return false;
+}
+
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return do_kvm_unmap_gfn(kvm, range);
+}
+
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* XXX could be more clever ;) */
+	return false;
+}
+
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* XXX could be more clever ;) */
+	return false;
+}
+
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* The page will get remapped properly on its next fault */
+	return do_kvm_unmap_gfn(kvm, range);
+}
+
+/*****************************************/
+
+static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	ulong old_msr;
+
+	/* For PAPR guest, make sure MSR reflects guest mode */
+	if (vcpu->arch.papr_enabled)
+		msr = (msr & ~MSR_HV) | MSR_ME;
+
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* We should never target guest MSR to TS=10 && PR=0,
+	 * since we always fail transaction for guest privilege
+	 * state.
+	 */
+	if (!(msr & MSR_PR) && MSR_TM_TRANSACTIONAL(msr))
+		kvmppc_emulate_tabort(vcpu,
+			TM_CAUSE_KVM_FAC_UNAV | TM_CAUSE_PERSISTENT);
+#endif
+
+	old_msr = kvmppc_get_msr(vcpu);
+	msr &= to_book3s(vcpu)->msr_mask;
+	kvmppc_set_msr_fast(vcpu, msr);
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	if (msr & MSR_POW) {
+		if (!vcpu->arch.pending_exceptions) {
+			kvm_vcpu_halt(vcpu);
+			vcpu->stat.generic.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			kvmppc_set_msr_fast(vcpu, msr);
+		}
+	}
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
+	else
+		kvmppc_unfixup_split_real(vcpu);
+
+	if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) !=
+		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
+	}
+
+	/*
+	 * When switching from 32 to 64-bit, we may have a stale 32-bit
+	 * magic page around, we need to flush it. Typically 32-bit magic
+	 * page will be instantiated when calling into RTAS. Note: We
+	 * assume that such transition only happens while in kernel mode,
+	 * ie, we never transition from user 32-bit to kernel 64-bit with
+	 * a 32-bit magic page around.
+	 */
+	if (vcpu->arch.magic_page_pa &&
+	    !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) {
+		/* going from RTAS to normal kernel code */
+		kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa,
+				     ~0xFFFUL);
+	}
+
+	/* Preload FPU if it's enabled */
+	if (kvmppc_get_msr(vcpu) & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (kvmppc_get_msr(vcpu) & MSR_TM)
+		kvmppc_handle_lost_math_exts(vcpu);
+#endif
+}
+
+static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	u32 host_pvr;
+
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
+	vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
+	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+		kvmppc_mmu_book3s_64_init(vcpu);
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0xfff00000;
+		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_64;
+	} else
+#endif
+	{
+		kvmppc_mmu_book3s_32_init(vcpu);
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0;
+		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_32;
+	}
+
+	kvmppc_sanity_check(vcpu);
+
+	/* If we are in hypervisor level on 970, we can tell the CPU to
+	 * treat DCBZ as 32 bytes store */
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+	    !strcmp(cur_cpu_spec->platform, "ppc970"))
+		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+
+	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
+	   really needs them in a VM on Cell and force disable them. */
+	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+	/*
+	 * If they're asking for POWER6 or later, set the flag
+	 * indicating that we can do multiple large page sizes
+	 * and 1TB segments.
+	 * Also set the flag that indicates that tlbie has the large
+	 * page bit in the RB operand instead of the instruction.
+	 */
+	switch (PVR_VER(pvr)) {
+	case PVR_POWER6:
+	case PVR_POWER7:
+	case PVR_POWER7p:
+	case PVR_POWER8:
+	case PVR_POWER8E:
+	case PVR_POWER8NVL:
+	case PVR_POWER9:
+		vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
+			BOOK3S_HFLAG_NEW_TLBIE;
+		break;
+	}
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	/* 32 bit Book3S always has 32 byte dcbz */
+	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+	/* On some CPUs we can execute paired single operations natively */
+	asm ( "mfpvr %0" : "=r"(host_pvr));
+	switch (host_pvr) {
+	case 0x00080200:	/* lonestar 2.0 */
+	case 0x00088202:	/* lonestar 2.2 */
+	case 0x70000100:	/* gekko 1.0 */
+	case 0x00080100:	/* gekko 2.0 */
+	case 0x00083203:	/* gekko 2.3a */
+	case 0x00083213:	/* gekko 2.3b */
+	case 0x00083204:	/* gekko 2.4 */
+	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+	case 0x00087200:	/* broadway */
+		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+		/* Enable HID2.PSE - in case we need it later */
+		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+	}
+}
+
+/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
+ * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
+ * emulate 32 bytes dcbz length.
+ *
+ * The Book3s_64 inventors also realized this case and implemented a special bit
+ * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	struct page *hpage;
+	u64 hpage_offset;
+	u32 *page;
+	int i;
+
+	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+	if (is_error_page(hpage))
+		return;
+
+	hpage_offset = pte->raddr & ~PAGE_MASK;
+	hpage_offset &= ~0xFFFULL;
+	hpage_offset /= 4;
+
+	get_page(hpage);
+	page = kmap_atomic(hpage);
+
+	/* patch dcbz into reserved instruction, so we trap */
+	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+		if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ)
+			page[i] &= cpu_to_be32(0xfffffff7);
+
+	kunmap_atomic(page);
+	put_page(hpage);
+}
+
+static bool kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
+	gpa &= ~0xFFFULL;
+	if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) == (gpa & KVM_PAM))) {
+		return true;
+	}
+
+	return kvm_is_visible_gfn(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+static int kvmppc_handle_pagefault(struct kvm_vcpu *vcpu,
+			    ulong eaddr, int vec)
+{
+	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+	bool iswrite = false;
+	int r = RESUME_GUEST;
+	int relocated;
+	int page_found = 0;
+	struct kvmppc_pte pte = { 0 };
+	bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false;
+	bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false;
+	u64 vsid;
+
+	relocated = data ? dr : ir;
+	if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE))
+		iswrite = true;
+
+	/* Resolve real address if translation turned on */
+	if (relocated) {
+		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite);
+	} else {
+		pte.may_execute = true;
+		pte.may_read = true;
+		pte.may_write = true;
+		pte.raddr = eaddr & KVM_PAM;
+		pte.eaddr = eaddr;
+		pte.vpage = eaddr >> 12;
+		pte.page_size = MMU_PAGE_64K;
+		pte.wimg = HPTE_R_M;
+	}
+
+	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {
+	case 0:
+		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+		break;
+	case MSR_DR:
+		if (!data &&
+		    (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+		    ((pte.raddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+			pte.raddr &= ~SPLIT_HACK_MASK;
+		fallthrough;
+	case MSR_IR:
+		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+		if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR)
+			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+		else
+			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+		pte.vpage |= vsid;
+
+		if (vsid == -1)
+			page_found = -EINVAL;
+		break;
+	}
+
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+		/*
+		 * If we do the dcbz hack, we have to NX on every execution,
+		 * so we can patch the executing code. This renders our guest
+		 * NX-less.
+		 */
+		pte.may_execute = !data;
+	}
+
+	if (page_found == -ENOENT || page_found == -EPERM) {
+		/* Page not found in guest PTE entries, or protection fault */
+		u64 flags;
+
+		if (page_found == -EPERM)
+			flags = DSISR_PROTFAULT;
+		else
+			flags = DSISR_NOHPTE;
+		if (data) {
+			flags |= vcpu->arch.fault_dsisr & DSISR_ISSTORE;
+			kvmppc_core_queue_data_storage(vcpu, 0, eaddr, flags);
+		} else {
+			kvmppc_core_queue_inst_storage(vcpu, flags);
+		}
+	} else if (page_found == -EINVAL) {
+		/* Page not found in guest SLB */
+		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+	} else if (kvmppc_visible_gpa(vcpu, pte.raddr)) {
+		if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) {
+			/*
+			 * There is already a host HPTE there, presumably
+			 * a read-only one for a page the guest thinks
+			 * is writable, so get rid of it first.
+			 */
+			kvmppc_mmu_unmap_page(vcpu, &pte);
+		}
+		/* The guest's PTE is not mapped yet. Map on the host */
+		if (kvmppc_mmu_map_page(vcpu, &pte, iswrite) == -EIO) {
+			/* Exit KVM if mapping failed */
+			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+		if (data)
+			vcpu->stat.sp_storage++;
+		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+			kvmppc_patch_dcbz(vcpu, &pte);
+	} else {
+		/* MMIO */
+		vcpu->stat.mmio_exits++;
+		vcpu->arch.paddr_accessed = pte.raddr;
+		vcpu->arch.vaddr_accessed = pte.eaddr;
+		r = kvmppc_emulate_mmio(vcpu);
+		if ( r == RESUME_HOST_NV )
+			r = RESUME_HOST;
+	}
+
+	return r;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+
+	/*
+	 * VSX instructions can access FP and vector registers, so if
+	 * we are giving up VSX, make sure we give up FP and VMX as well.
+	 */
+	if (msr & MSR_VSX)
+		msr |= MSR_FP | MSR_VEC;
+
+	msr &= vcpu->arch.guest_owned_ext;
+	if (!msr)
+		return;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+	if (msr & MSR_FP) {
+		/*
+		 * Note that on CPUs with VSX, giveup_fpu stores
+		 * both the traditional FP registers and the added VSX
+		 * registers into thread.fp_state.fpr[].
+		 */
+		if (t->regs->msr & MSR_FP)
+			giveup_fpu(current);
+		t->fp_save_area = NULL;
+	}
+
+#ifdef CONFIG_ALTIVEC
+	if (msr & MSR_VEC) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		t->vr_save_area = NULL;
+	}
+#endif
+
+	vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
+	kvmppc_recalc_shadow_msr(vcpu);
+}
+
+/* Give up facility (TAR / EBB / DSCR) */
+void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) {
+		/* Facility not available to the guest, ignore giveup request*/
+		return;
+	}
+
+	switch (fac) {
+	case FSCR_TAR_LG:
+		vcpu->arch.tar = mfspr(SPRN_TAR);
+		mtspr(SPRN_TAR, current->thread.tar);
+		vcpu->arch.shadow_fscr &= ~FSCR_TAR;
+		break;
+	}
+#endif
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+
+	/* When we have paired singles, we emulate in software */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+		return RESUME_GUEST;
+
+	if (!(kvmppc_get_msr(vcpu) & msr)) {
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		return RESUME_GUEST;
+	}
+
+	if (msr == MSR_VSX) {
+		/* No VSX?  Give an illegal instruction interrupt */
+#ifdef CONFIG_VSX
+		if (!cpu_has_feature(CPU_FTR_VSX))
+#endif
+		{
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			return RESUME_GUEST;
+		}
+
+		/*
+		 * We have to load up all the FP and VMX registers before
+		 * we can let the guest use VSX instructions.
+		 */
+		msr = MSR_FP | MSR_VEC | MSR_VSX;
+	}
+
+	/* See if we already own all the ext(s) needed */
+	msr &= ~vcpu->arch.guest_owned_ext;
+	if (!msr)
+		return RESUME_GUEST;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+	if (msr & MSR_FP) {
+		preempt_disable();
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
+		t->fp_save_area = &vcpu->arch.fp;
+		preempt_enable();
+	}
+
+	if (msr & MSR_VEC) {
+#ifdef CONFIG_ALTIVEC
+		preempt_disable();
+		enable_kernel_altivec();
+		load_vr_state(&vcpu->arch.vr);
+		disable_kernel_altivec();
+		t->vr_save_area = &vcpu->arch.vr;
+		preempt_enable();
+#endif
+	}
+
+	t->regs->msr |= msr;
+	vcpu->arch.guest_owned_ext |= msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	return RESUME_GUEST;
+}
+
+/*
+ * Kernel code using FP or VMX could have flushed guest state to
+ * the thread_struct; if so, get it back now.
+ */
+static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
+{
+	unsigned long lost_ext;
+
+	lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
+	if (!lost_ext)
+		return;
+
+	if (lost_ext & MSR_FP) {
+		preempt_disable();
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
+		preempt_enable();
+	}
+#ifdef CONFIG_ALTIVEC
+	if (lost_ext & MSR_VEC) {
+		preempt_disable();
+		enable_kernel_altivec();
+		load_vr_state(&vcpu->arch.vr);
+		disable_kernel_altivec();
+		preempt_enable();
+	}
+#endif
+	current->thread.regs->msr |= lost_ext;
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac)
+{
+	/* Inject the Interrupt Cause field and trigger a guest interrupt */
+	vcpu->arch.fscr &= ~(0xffULL << 56);
+	vcpu->arch.fscr |= (fac << 56);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL);
+}
+
+static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+	enum emulation_result er = EMULATE_FAIL;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_PR))
+		er = kvmppc_emulate_instruction(vcpu);
+
+	if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) {
+		/* Couldn't emulate, trigger interrupt in guest */
+		kvmppc_trigger_fac_interrupt(vcpu, fac);
+	}
+}
+
+/* Enable facilities (TAR, EBB, DSCR) for the guest */
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+	bool guest_fac_enabled;
+	BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S));
+
+	/*
+	 * Not every facility is enabled by FSCR bits, check whether the
+	 * guest has this facility enabled at all.
+	 */
+	switch (fac) {
+	case FSCR_TAR_LG:
+	case FSCR_EBB_LG:
+		guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac));
+		break;
+	case FSCR_TM_LG:
+		guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM;
+		break;
+	default:
+		guest_fac_enabled = false;
+		break;
+	}
+
+	if (!guest_fac_enabled) {
+		/* Facility not enabled by the guest */
+		kvmppc_trigger_fac_interrupt(vcpu, fac);
+		return RESUME_GUEST;
+	}
+
+	switch (fac) {
+	case FSCR_TAR_LG:
+		/* TAR switching isn't lazy in Linux yet */
+		current->thread.tar = mfspr(SPRN_TAR);
+		mtspr(SPRN_TAR, vcpu->arch.tar);
+		vcpu->arch.shadow_fscr |= FSCR_TAR;
+		break;
+	default:
+		kvmppc_emulate_fac(vcpu, fac);
+		break;
+	}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Since we disabled MSR_TM at privilege state, the mfspr instruction
+	 * for TM spr can trigger TM fac unavailable. In this case, the
+	 * emulation is handled by kvmppc_emulate_fac(), which invokes
+	 * kvmppc_emulate_mfspr() finally. But note the mfspr can include
+	 * RT for NV registers. So it need to restore those NV reg to reflect
+	 * the update.
+	 */
+	if ((fac == FSCR_TM_LG) && !(kvmppc_get_msr(vcpu) & MSR_PR))
+		return RESUME_GUEST_NV;
+#endif
+
+	return RESUME_GUEST;
+}
+
+void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr)
+{
+	if (fscr & FSCR_SCV)
+		fscr &= ~FSCR_SCV; /* SCV must not be enabled */
+	/* Prohibit prefixed instructions for now */
+	fscr &= ~FSCR_PREFIX;
+	if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) {
+		/* TAR got dropped, drop it in shadow too */
+		kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+	} else if (!(vcpu->arch.fscr & FSCR_TAR) && (fscr & FSCR_TAR)) {
+		vcpu->arch.fscr = fscr;
+		kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+		return;
+	}
+
+	vcpu->arch.fscr = fscr;
+}
+#endif
+
+static void kvmppc_setup_debug(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		u64 msr = kvmppc_get_msr(vcpu);
+
+		kvmppc_set_msr(vcpu, msr | MSR_SE);
+	}
+}
+
+static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		u64 msr = kvmppc_get_msr(vcpu);
+
+		kvmppc_set_msr(vcpu, msr & ~MSR_SE);
+	}
+}
+
+static int kvmppc_exit_pr_progint(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+	enum emulation_result er;
+	ulong flags;
+	ppc_inst_t last_inst;
+	int emul, r;
+
+	/*
+	 * shadow_srr1 only contains valid flags if we came here via a program
+	 * exception. The other exceptions (emulation assist, FP unavailable,
+	 * etc.) do not provide flags in SRR1, so use an illegal-instruction
+	 * exception when injecting a program interrupt into the guest.
+	 */
+	if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+		flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+	else
+		flags = SRR1_PROGILL;
+
+	emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+	if (emul != EMULATE_DONE)
+		return RESUME_GUEST;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+		pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+			kvmppc_get_pc(vcpu), ppc_inst_val(last_inst));
+#endif
+		if ((ppc_inst_val(last_inst) & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
+			kvmppc_core_queue_program(vcpu, flags);
+			return RESUME_GUEST;
+		}
+	}
+
+	vcpu->stat.emulated_inst_exits++;
+	er = kvmppc_emulate_instruction(vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		r = RESUME_GUEST_NV;
+		break;
+	case EMULATE_AGAIN:
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_FAIL:
+		pr_crit("%s: emulation at %lx failed (%08x)\n",
+			__func__, kvmppc_get_pc(vcpu), ppc_inst_val(last_inst));
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_DO_MMIO:
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST_NV;
+		break;
+	case EMULATE_EXIT_USER:
+		r = RESUME_HOST_NV;
+		break;
+	default:
+		BUG();
+	}
+
+	return r;
+}
+
+int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+	struct kvm_run *run = vcpu->run;
+	int r = RESUME_HOST;
+	int s;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	/* We get here with MSR.EE=1 */
+
+	trace_kvm_exit(exit_nr, vcpu);
+	guest_exit();
+
+	switch (exit_nr) {
+	case BOOK3S_INTERRUPT_INST_STORAGE:
+	{
+		ulong shadow_srr1 = vcpu->arch.shadow_srr1;
+		vcpu->stat.pf_instruc++;
+
+		if (kvmppc_is_split_real(vcpu))
+			kvmppc_fixup_split_real(vcpu);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		{
+			struct kvmppc_book3s_shadow_vcpu *svcpu;
+			u32 sr;
+
+			svcpu = svcpu_get(vcpu);
+			sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT];
+			svcpu_put(svcpu);
+			if (sr == SR_INVALID) {
+				kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+#endif
+
+		/* only care about PTEG not found errors, but leave NX alone */
+		if (shadow_srr1 & 0x40000000) {
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_handle_pagefault(vcpu, kvmppc_get_pc(vcpu), exit_nr);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+			vcpu->stat.sp_instruc++;
+		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+			/*
+			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+			 *     that no guest that needs the dcbz hack does NX.
+			 */
+			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+			r = RESUME_GUEST;
+		} else {
+			kvmppc_core_queue_inst_storage(vcpu,
+						shadow_srr1 & 0x58000000);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_STORAGE:
+	{
+		ulong dar = kvmppc_get_fault_dar(vcpu);
+		u32 fault_dsisr = vcpu->arch.fault_dsisr;
+		vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		{
+			struct kvmppc_book3s_shadow_vcpu *svcpu;
+			u32 sr;
+
+			svcpu = svcpu_get(vcpu);
+			sr = svcpu->sr[dar >> SID_SHIFT];
+			svcpu_put(svcpu);
+			if (sr == SR_INVALID) {
+				kvmppc_mmu_map_segment(vcpu, dar);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+#endif
+
+		/*
+		 * We need to handle missing shadow PTEs, and
+		 * protection faults due to us mapping a page read-only
+		 * when the guest thinks it is writable.
+		 */
+		if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) {
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_handle_pagefault(vcpu, dar, exit_nr);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		} else {
+			kvmppc_core_queue_data_storage(vcpu, 0, dar, fault_dsisr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+			kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_DATA_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_INST_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_INST_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_DECREMENTER:
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+	case BOOK3S_INTERRUPT_DOORBELL:
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_EXTERNAL_HV:
+	case BOOK3S_INTERRUPT_H_VIRT:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_HMI:
+	case BOOK3S_INTERRUPT_PERFMON:
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		r = kvmppc_exit_pr_progint(vcpu, exit_nr);
+		break;
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		ppc_inst_t last_sc;
+		int emul;
+
+		/* Get last sc for papr */
+		if (vcpu->arch.papr_enabled) {
+			/* The sc instruction points SRR0 to the next inst */
+			emul = kvmppc_get_last_inst(vcpu, INST_SC, &last_sc);
+			if (emul != EMULATE_DONE) {
+				kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) - 4);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+
+		if (vcpu->arch.papr_enabled &&
+		    (ppc_inst_val(last_sc) == 0x44000022) &&
+		    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+			/* SC 1 papr hypercalls */
+			ulong cmd = kvmppc_get_gpr(vcpu, 3);
+			int i;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
+				r = RESUME_GUEST;
+				break;
+			}
+#endif
+
+			run->papr_hcall.nr = cmd;
+			for (i = 0; i < 9; ++i) {
+				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+				run->papr_hcall.args[i] = gpr;
+			}
+			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			r = RESUME_HOST;
+		} else if (vcpu->arch.osi_enabled &&
+		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
+			u64 *gprs = run->osi.gprs;
+			int i;
+
+			run->exit_reason = KVM_EXIT_OSI;
+			for (i = 0; i < 32; i++)
+				gprs[i] = kvmppc_get_gpr(vcpu, i);
+			vcpu->arch.osi_needed = 1;
+			r = RESUME_HOST_NV;
+		} else if (!(kvmppc_get_msr(vcpu) & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			vcpu->stat.syscall_exits++;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_FP_UNAVAIL:
+	case BOOK3S_INTERRUPT_ALTIVEC:
+	case BOOK3S_INTERRUPT_VSX:
+	{
+		int ext_msr = 0;
+		int emul;
+		ppc_inst_t last_inst;
+
+		if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) {
+			/* Do paired single instruction emulation */
+			emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
+						    &last_inst);
+			if (emul == EMULATE_DONE)
+				r = kvmppc_exit_pr_progint(vcpu, exit_nr);
+			else
+				r = RESUME_GUEST;
+
+			break;
+		}
+
+		/* Enable external provider */
+		switch (exit_nr) {
+		case BOOK3S_INTERRUPT_FP_UNAVAIL:
+			ext_msr = MSR_FP;
+			break;
+
+		case BOOK3S_INTERRUPT_ALTIVEC:
+			ext_msr = MSR_VEC;
+			break;
+
+		case BOOK3S_INTERRUPT_VSX:
+			ext_msr = MSR_VSX;
+			break;
+		}
+
+		r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+		break;
+	}
+	case BOOK3S_INTERRUPT_ALIGNMENT:
+	{
+		ppc_inst_t last_inst;
+		int emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+
+		if (emul == EMULATE_DONE) {
+			u32 dsisr;
+			u64 dar;
+
+			dsisr = kvmppc_alignment_dsisr(vcpu, ppc_inst_val(last_inst));
+			dar = kvmppc_alignment_dar(vcpu, ppc_inst_val(last_inst));
+
+			kvmppc_set_dsisr(vcpu, dsisr);
+			kvmppc_set_dar(vcpu, dar);
+
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		}
+		r = RESUME_GUEST;
+		break;
+	}
+#ifdef CONFIG_PPC_BOOK3S_64
+	case BOOK3S_INTERRUPT_FAC_UNAVAIL:
+		r = kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
+		break;
+#endif
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_TRACE:
+		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+			run->exit_reason = KVM_EXIT_DEBUG;
+			r = RESUME_HOST;
+		} else {
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	default:
+	{
+		ulong shadow_srr1 = vcpu->arch.shadow_srr1;
+		/* Ugh - bork here! What did we get? */
+		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+			exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+	}
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+
+		/*
+		 * Interrupts could be timers for the guest which we have to
+		 * inject again, so let's postpone them until we're in the guest
+		 * and if we really did time things so badly, then we just exit
+		 * again due to a host external interrupt.
+		 */
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0)
+			r = s;
+		else {
+			/* interrupts now hard-disabled */
+			kvmppc_fix_ee_before_entry();
+		}
+
+		kvmppc_handle_lost_ext(vcpu);
+	}
+
+	trace_kvm_book3s_reenter(r, vcpu);
+
+	return r;
+}
+
+static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+		}
+	} else {
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i);
+
+		for (i = 0; i < 8; i++) {
+			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+		}
+	}
+
+	return 0;
+}
+
+static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	kvmppc_set_pvr_pr(vcpu, sregs->pvr);
+
+	vcpu3s->sdr1 = sregs->u.s.sdr1;
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		/* Flush all SLB entries */
+		vcpu->arch.mmu.slbmte(vcpu, 0, 0);
+		vcpu->arch.mmu.slbia(vcpu);
+
+		for (i = 0; i < 64; i++) {
+			u64 rb = sregs->u.s.ppc64.slb[i].slbe;
+			u64 rs = sregs->u.s.ppc64.slb[i].slbv;
+
+			if (rb & SLB_ESID_V)
+				vcpu->arch.mmu.slbmte(vcpu, rs, rb);
+		}
+	} else
+#endif
+	{
+		for (i = 0; i < 16; i++) {
+			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
+		}
+		for (i = 0; i < 8; i++) {
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
+				       (u32)sregs->u.s.ppc32.ibat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
+				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
+				       (u32)sregs->u.s.ppc32.dbat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
+				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
+		}
+	}
+
+	/* Flush the MMU after messing with the segments */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return 0;
+}
+
+static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_HIOR:
+		*val = get_reg_val(id, to_book3s(vcpu)->hior);
+		break;
+	case KVM_REG_PPC_VTB:
+		*val = get_reg_val(id, to_book3s(vcpu)->vtb);
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		/*
+		 * We are only interested in the LPCR_ILE bit
+		 */
+		if (vcpu->arch.intr_msr & MSR_LE)
+			*val = get_reg_val(id, LPCR_ILE);
+		else
+			*val = get_reg_val(id, 0);
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		*val = get_reg_val(id, vcpu->arch.tfhar);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		*val = get_reg_val(id, vcpu->arch.tfiar);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		*val = get_reg_val(id, vcpu->arch.texasr);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		*val = get_reg_val(id,
+				vcpu->arch.gpr_tm[id-KVM_REG_PPC_TM_GPR0]);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int i, j;
+
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+		else {
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				val->vval = vcpu->arch.vr_tm.vr[i-32];
+			else
+				r = -ENXIO;
+		}
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		*val = get_reg_val(id, vcpu->arch.cr_tm);
+		break;
+	case KVM_REG_PPC_TM_XER:
+		*val = get_reg_val(id, vcpu->arch.xer_tm);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		*val = get_reg_val(id, vcpu->arch.lr_tm);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		*val = get_reg_val(id, vcpu->arch.ctr_tm);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		*val = get_reg_val(id, vcpu->arch.amr_tm);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		*val = get_reg_val(id, vcpu->arch.ppr_tm);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+		else
+			r = -ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		*val = get_reg_val(id, vcpu->arch.dscr_tm);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		*val = get_reg_val(id, vcpu->arch.tar_tm);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+	if (new_lpcr & LPCR_ILE)
+		vcpu->arch.intr_msr |= MSR_LE;
+	else
+		vcpu->arch.intr_msr &= ~MSR_LE;
+}
+
+static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_HIOR:
+		to_book3s(vcpu)->hior = set_reg_val(id, *val);
+		to_book3s(vcpu)->hior_explicit = true;
+		break;
+	case KVM_REG_PPC_VTB:
+		to_book3s(vcpu)->vtb = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		vcpu->arch.tfhar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		vcpu->arch.tfiar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		vcpu->arch.texasr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		vcpu->arch.gpr_tm[id - KVM_REG_PPC_TM_GPR0] =
+			set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int i, j;
+
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+		else
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				vcpu->arch.vr_tm.vr[i-32] = val->vval;
+			else
+				r = -ENXIO;
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		vcpu->arch.cr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_XER:
+		vcpu->arch.xer_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		vcpu->arch.lr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		vcpu->arch.ctr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		vcpu->arch.amr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		vcpu->arch.ppr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+		else
+			r = -ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		vcpu->arch.dscr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		vcpu->arch.tar_tm = set_reg_val(id, *val);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s;
+	unsigned long p;
+	int err;
+
+	err = -ENOMEM;
+
+	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+	if (!vcpu_book3s)
+		goto out;
+	vcpu->arch.book3s = vcpu_book3s;
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	vcpu->arch.shadow_vcpu =
+		kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL);
+	if (!vcpu->arch.shadow_vcpu)
+		goto free_vcpu3s;
+#endif
+
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!p)
+		goto free_shadow_vcpu;
+	vcpu->arch.shared = (void *)p;
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Always start the shared struct in native endian mode */
+#ifdef __BIG_ENDIAN__
+        vcpu->arch.shared_big_endian = true;
+#else
+        vcpu->arch.shared_big_endian = false;
+#endif
+
+	/*
+	 * Default to the same as the host if we're on sufficiently
+	 * recent machine that we have 1TB segments;
+	 * otherwise default to PPC970FX.
+	 */
+	vcpu->arch.pvr = 0x3C0301;
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu->arch.intr_msr = MSR_SF;
+#else
+	/* default to book3s_32 (750) */
+	vcpu->arch.pvr = 0x84202;
+	vcpu->arch.intr_msr = 0;
+#endif
+	kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr);
+	vcpu->arch.slb_nr = 64;
+
+	vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE;
+
+	err = kvmppc_mmu_init_pr(vcpu);
+	if (err < 0)
+		goto free_shared_page;
+
+	return 0;
+
+free_shared_page:
+	free_page((unsigned long)vcpu->arch.shared);
+free_shadow_vcpu:
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kfree(vcpu->arch.shadow_vcpu);
+free_vcpu3s:
+#endif
+	vfree(vcpu_book3s);
+out:
+	return err;
+}
+
+static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+
+	kvmppc_mmu_destroy_pr(vcpu);
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kfree(vcpu->arch.shadow_vcpu);
+#endif
+	vfree(vcpu_book3s);
+}
+
+static int kvmppc_vcpu_run_pr(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	/* Check if we can run the vcpu at all */
+	if (!vcpu->arch.sane) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	kvmppc_setup_debug(vcpu);
+
+	/*
+	 * Interrupts could be timers for the guest which we have to inject
+	 * again, so let's postpone them until we're in the guest and if we
+	 * really did time things so badly, then we just exit again due to
+	 * a host external interrupt.
+	 */
+	ret = kvmppc_prepare_to_enter(vcpu);
+	if (ret <= 0)
+		goto out;
+	/* interrupts now hard-disabled */
+
+	/* Save FPU, Altivec and VSX state */
+	giveup_all(current);
+
+	/* Preload FPU if it's enabled */
+	if (kvmppc_get_msr(vcpu) & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+	kvmppc_fix_ee_before_entry();
+
+	ret = __kvmppc_vcpu_run(vcpu);
+
+	kvmppc_clear_debug(vcpu);
+
+	/* No need for guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
+
+	/* Make sure we save the guest FPU/Altivec/VSX state */
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+
+	/* Make sure we save the guest TAR/EBB/DSCR state */
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+
+	srr_regs_clobbered();
+out:
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	return ret;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
+					 struct kvm_dirty_log *log)
+{
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	ulong ga, ga_end;
+	int is_dirty = 0;
+	int r;
+	unsigned long n;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
+	if (r)
+		goto out;
+
+	/* If nothing is dirty, don't bother messing with page tables. */
+	if (is_dirty) {
+		ga = memslot->base_gfn << PAGE_SHIFT;
+		ga_end = ga + (memslot->npages << PAGE_SHIFT);
+
+		kvm_for_each_vcpu(n, vcpu, kvm)
+			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
+
+		n = kvm_dirty_bitmap_bytes(memslot);
+		memset(memslot->dirty_bitmap, 0, n);
+	}
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
+static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
+					 struct kvm_memory_slot *memslot)
+{
+	return;
+}
+
+static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
+				const struct kvm_memory_slot *old,
+				struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+	return 0;
+}
+
+static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
+				struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+	return;
+}
+
+static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *slot)
+{
+	return;
+}
+
+#ifdef CONFIG_PPC64
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	long int i;
+	struct kvm_vcpu *vcpu;
+
+	info->flags = 0;
+
+	/* SLB is always 64 entries */
+	info->slb_size = 64;
+
+	/* Standard 4k base page size segment */
+	info->sps[0].page_shift = 12;
+	info->sps[0].slb_enc = 0;
+	info->sps[0].enc[0].page_shift = 12;
+	info->sps[0].enc[0].pte_enc = 0;
+
+	/*
+	 * 64k large page size.
+	 * We only want to put this in if the CPUs we're emulating
+	 * support it, but unfortunately we don't have a vcpu easily
+	 * to hand here to test.  Just pick the first vcpu, and if
+	 * that doesn't exist yet, report the minimum capability,
+	 * i.e., no 64k pages.
+	 * 1T segment support goes along with 64k pages.
+	 */
+	i = 1;
+	vcpu = kvm_get_vcpu(kvm, 0);
+	if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+		info->flags = KVM_PPC_1T_SEGMENTS;
+		info->sps[i].page_shift = 16;
+		info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01;
+		info->sps[i].enc[0].page_shift = 16;
+		info->sps[i].enc[0].pte_enc = 1;
+		++i;
+	}
+
+	/* Standard 16M large page size segment */
+	info->sps[i].page_shift = 24;
+	info->sps[i].slb_enc = SLB_VSID_L;
+	info->sps[i].enc[0].page_shift = 24;
+	info->sps[i].enc[0].pte_enc = 0;
+
+	return 0;
+}
+
+static int kvm_configure_mmu_pr(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+	/* Require flags and process table base and size to all be zero. */
+	if (cfg->flags || cfg->process_table)
+		return -EINVAL;
+	return 0;
+}
+
+#else
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	/* We should not get called */
+	BUG();
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+
+static unsigned int kvm_global_user_count = 0;
+static DEFINE_SPINLOCK(kvm_global_user_count_lock);
+
+static int kvmppc_core_init_vm_pr(struct kvm *kvm)
+{
+	mutex_init(&kvm->arch.hpt_mutex);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Start out with the default set of hcalls enabled */
+	kvmppc_pr_init_default_hcalls(kvm);
+#endif
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		spin_lock(&kvm_global_user_count_lock);
+		if (++kvm_global_user_count == 1)
+			pseries_disable_reloc_on_exc();
+		spin_unlock(&kvm_global_user_count_lock);
+	}
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_pr(struct kvm *kvm)
+{
+#ifdef CONFIG_PPC64
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		spin_lock(&kvm_global_user_count_lock);
+		BUG_ON(kvm_global_user_count == 0);
+		if (--kvm_global_user_count == 0)
+			pseries_enable_reloc_on_exc();
+		spin_unlock(&kvm_global_user_count_lock);
+	}
+}
+
+static int kvmppc_core_check_processor_compat_pr(void)
+{
+	/*
+	 * PR KVM can work on POWER9 inside a guest partition
+	 * running in HPT mode.  It can't work if we are using
+	 * radix translation (because radix provides no way for
+	 * a process to have unique translations in quadrant 3).
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+		return -EIO;
+	return 0;
+}
+
+static int kvm_arch_vm_ioctl_pr(struct file *filp,
+				unsigned int ioctl, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static struct kvmppc_ops kvm_ops_pr = {
+	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr,
+	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr,
+	.get_one_reg = kvmppc_get_one_reg_pr,
+	.set_one_reg = kvmppc_set_one_reg_pr,
+	.vcpu_load   = kvmppc_core_vcpu_load_pr,
+	.vcpu_put    = kvmppc_core_vcpu_put_pr,
+	.inject_interrupt = kvmppc_inject_interrupt_pr,
+	.set_msr     = kvmppc_set_msr_pr,
+	.vcpu_run    = kvmppc_vcpu_run_pr,
+	.vcpu_create = kvmppc_core_vcpu_create_pr,
+	.vcpu_free   = kvmppc_core_vcpu_free_pr,
+	.check_requests = kvmppc_core_check_requests_pr,
+	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr,
+	.flush_memslot = kvmppc_core_flush_memslot_pr,
+	.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
+	.commit_memory_region = kvmppc_core_commit_memory_region_pr,
+	.unmap_gfn_range = kvm_unmap_gfn_range_pr,
+	.age_gfn  = kvm_age_gfn_pr,
+	.test_age_gfn = kvm_test_age_gfn_pr,
+	.set_spte_gfn = kvm_set_spte_gfn_pr,
+	.free_memslot = kvmppc_core_free_memslot_pr,
+	.init_vm = kvmppc_core_init_vm_pr,
+	.destroy_vm = kvmppc_core_destroy_vm_pr,
+	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr,
+	.emulate_op = kvmppc_core_emulate_op_pr,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_pr,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_pr,
+	.fast_vcpu_kick = kvm_vcpu_kick,
+	.arch_vm_ioctl  = kvm_arch_vm_ioctl_pr,
+#ifdef CONFIG_PPC_BOOK3S_64
+	.hcall_implemented = kvmppc_hcall_impl_pr,
+	.configure_mmu = kvm_configure_mmu_pr,
+#endif
+	.giveup_ext = kvmppc_giveup_ext,
+};
+
+
+int kvmppc_book3s_init_pr(void)
+{
+	int r;
+
+	r = kvmppc_core_check_processor_compat_pr();
+	if (r < 0)
+		return r;
+
+	kvm_ops_pr.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_pr;
+
+	r = kvmppc_mmu_hpte_sysinit();
+	return r;
+}
+
+void kvmppc_book3s_exit_pr(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_mmu_hpte_sysexit();
+}
+
+/*
+ * We only support separate modules for book3s 64
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+
+module_init(kvmppc_book3s_init_pr);
+module_exit(kvmppc_book3s_exit_pr);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
+#endif
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
new file mode 100644
index 0000000000..b2c89e850d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011. Freescale Inc. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ *
+ * Hypercall handling for running PAPR guests in PR KVM on Book 3S
+ * processors.
+ */
+
+#include <linux/anon_inodes.h>
+
+#include <linux/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+#define HPTE_SIZE	16		/* bytes per HPT entry */
+
+static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	unsigned long pteg_addr;
+
+	pte_index <<= 4;
+	pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
+	pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+	pteg_addr |= pte_index;
+
+	return pteg_addr;
+}
+
+static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
+{
+	long flags = kvmppc_get_gpr(vcpu, 4);
+	long pte_index = kvmppc_get_gpr(vcpu, 5);
+	__be64 pteg[2 * 8];
+	__be64 *hpte;
+	unsigned long pteg_addr, i;
+	long int ret;
+
+	i = pte_index & 7;
+	pte_index &= ~7UL;
+	pteg_addr = get_pteg_addr(vcpu, pte_index);
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	ret = H_FUNCTION;
+	if (copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)))
+		goto done;
+	hpte = pteg;
+
+	ret = H_PTEG_FULL;
+	if (likely((flags & H_EXACT) == 0)) {
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				goto done;
+			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0)
+				break;
+			hpte += 2;
+		}
+	} else {
+		hpte += i * 2;
+		if (*hpte & HPTE_V_VALID)
+			goto done;
+	}
+
+	hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6));
+	hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7));
+	pteg_addr += i * HPTE_SIZE;
+	ret = H_FUNCTION;
+	if (copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE))
+		goto done;
+	kvmppc_set_gpr(vcpu, 4, pte_index | i);
+	ret = H_SUCCESS;
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags= kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long v = 0, pteg, rb;
+	unsigned long pte[2];
+	long int ret;
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	ret = H_FUNCTION;
+	if (copy_from_user(pte, (void __user *)pteg, sizeof(pte)))
+		goto done;
+	pte[0] = be64_to_cpu((__force __be64)pte[0]);
+	pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+	ret = H_NOT_FOUND;
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (pte[0] & avpn) != 0))
+		goto done;
+
+	ret = H_FUNCTION;
+	if (copy_to_user((void __user *)pteg, &v, sizeof(v)))
+		goto done;
+
+	rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+
+	ret = H_SUCCESS;
+	kvmppc_set_gpr(vcpu, 4, pte[0]);
+	kvmppc_set_gpr(vcpu, 5, pte[1]);
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+/* Request defs for kvmppc_h_pr_bulk_remove() */
+#define H_BULK_REMOVE_TYPE             0xc000000000000000ULL
+#define   H_BULK_REMOVE_REQUEST        0x4000000000000000ULL
+#define   H_BULK_REMOVE_RESPONSE       0x8000000000000000ULL
+#define   H_BULK_REMOVE_END            0xc000000000000000ULL
+#define H_BULK_REMOVE_CODE             0x3000000000000000ULL
+#define   H_BULK_REMOVE_SUCCESS        0x0000000000000000ULL
+#define   H_BULK_REMOVE_NOT_FOUND      0x1000000000000000ULL
+#define   H_BULK_REMOVE_PARM           0x2000000000000000ULL
+#define   H_BULK_REMOVE_HW             0x3000000000000000ULL
+#define H_BULK_REMOVE_RC               0x0c00000000000000ULL
+#define H_BULK_REMOVE_FLAGS            0x0300000000000000ULL
+#define   H_BULK_REMOVE_ABSOLUTE       0x0000000000000000ULL
+#define   H_BULK_REMOVE_ANDCOND        0x0100000000000000ULL
+#define   H_BULK_REMOVE_AVPN           0x0200000000000000ULL
+#define H_BULK_REMOVE_PTEX             0x00ffffffffffffffULL
+#define H_BULK_REMOVE_MAX_BATCH        4
+
+static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	int i;
+	int paramnr = 4;
+	int ret = H_SUCCESS;
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
+		unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i));
+		unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1);
+		unsigned long pteg, rb, flags;
+		unsigned long pte[2];
+		unsigned long v = 0;
+
+		if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) {
+			break; /* Exit success */
+		} else if ((tsh & H_BULK_REMOVE_TYPE) !=
+			   H_BULK_REMOVE_REQUEST) {
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS;
+		tsh |= H_BULK_REMOVE_RESPONSE;
+
+		if ((tsh & H_BULK_REMOVE_ANDCOND) &&
+		    (tsh & H_BULK_REMOVE_AVPN)) {
+			tsh |= H_BULK_REMOVE_PARM;
+			kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX);
+		if (copy_from_user(pte, (void __user *)pteg, sizeof(pte))) {
+			ret = H_FUNCTION;
+			break;
+		}
+		pte[0] = be64_to_cpu((__force __be64)pte[0]);
+		pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+		/* tsl = AVPN */
+		flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26;
+
+		if ((pte[0] & HPTE_V_VALID) == 0 ||
+		    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) ||
+		    ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) {
+			tsh |= H_BULK_REMOVE_NOT_FOUND;
+		} else {
+			/* Splat the pteg in (userland) hpt */
+			if (copy_to_user((void __user *)pteg, &v, sizeof(v))) {
+				ret = H_FUNCTION;
+				break;
+			}
+
+			rb = compute_tlbie_rb(pte[0], pte[1],
+					      tsh & H_BULK_REMOVE_PTEX);
+			vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+			tsh |= H_BULK_REMOVE_SUCCESS;
+			tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43;
+		}
+		kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+	}
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long rb, pteg, r, v;
+	unsigned long pte[2];
+	long int ret;
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	ret = H_FUNCTION;
+	if (copy_from_user(pte, (void __user *)pteg, sizeof(pte)))
+		goto done;
+	pte[0] = be64_to_cpu((__force __be64)pte[0]);
+	pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+	ret = H_NOT_FOUND;
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn))
+		goto done;
+
+	v = pte[0];
+	r = pte[1];
+	r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
+	       HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	pte[1] = r;
+
+	rb = compute_tlbie_rb(v, r, pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+	pte[0] = (__force u64)cpu_to_be64(pte[0]);
+	pte[1] = (__force u64)cpu_to_be64(pte[1]);
+	ret = H_FUNCTION;
+	if (copy_to_user((void __user *)pteg, pte, sizeof(pte)))
+		goto done;
+	ret = H_SUCCESS;
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_load(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_store(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_set_mode(struct kvm_vcpu *vcpu)
+{
+	unsigned long mflags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long resource = kvmppc_get_gpr(vcpu, 5);
+
+	if (resource == H_SET_MODE_RESOURCE_ADDR_TRANS_MODE) {
+		/* KVM PR does not provide AIL!=0 to guests */
+		if (mflags == 0)
+			kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+		else
+			kvmppc_set_gpr(vcpu, 3, H_UNSUPPORTED_FLAG_START - 63);
+		return EMULATE_DONE;
+	}
+	return EMULATE_FAIL;
+}
+
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	long rc;
+
+	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba,
+			tce, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+#else /* CONFIG_SPAPR_TCE_IOMMU */
+static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+	return EMULATE_FAIL;
+}
+#endif /* CONFIG_SPAPR_TCE_IOMMU */
+
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long rc = kvmppc_xics_hcall(vcpu, cmd);
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
+{
+	int rc, idx;
+
+	if (cmd <= MAX_HCALL_OPCODE &&
+	    !test_bit(cmd/4, vcpu->kvm->arch.enabled_hcalls))
+		return EMULATE_FAIL;
+
+	switch (cmd) {
+	case H_ENTER:
+		return kvmppc_h_pr_enter(vcpu);
+	case H_REMOVE:
+		return kvmppc_h_pr_remove(vcpu);
+	case H_PROTECT:
+		return kvmppc_h_pr_protect(vcpu);
+	case H_BULK_REMOVE:
+		return kvmppc_h_pr_bulk_remove(vcpu);
+	case H_PUT_TCE:
+		return kvmppc_h_pr_put_tce(vcpu);
+	case H_PUT_TCE_INDIRECT:
+		return kvmppc_h_pr_put_tce_indirect(vcpu);
+	case H_STUFF_TCE:
+		return kvmppc_h_pr_stuff_tce(vcpu);
+	case H_CEDE:
+		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
+		kvm_vcpu_halt(vcpu);
+		vcpu->stat.generic.halt_wakeup++;
+		return EMULATE_DONE;
+	case H_LOGICAL_CI_LOAD:
+		return kvmppc_h_pr_logical_ci_load(vcpu);
+	case H_LOGICAL_CI_STORE:
+		return kvmppc_h_pr_logical_ci_store(vcpu);
+	case H_SET_MODE:
+		return kvmppc_h_pr_set_mode(vcpu);
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+		if (kvmppc_xics_enabled(vcpu))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			break;
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		rc = kvmppc_rtas_hcall(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		if (rc)
+			break;
+		kvmppc_set_gpr(vcpu, 3, 0);
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_FAIL;
+}
+
+int kvmppc_hcall_impl_pr(unsigned long cmd)
+{
+	switch (cmd) {
+	case H_ENTER:
+	case H_REMOVE:
+	case H_PROTECT:
+	case H_BULK_REMOVE:
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case H_GET_TCE:
+	case H_PUT_TCE:
+	case H_PUT_TCE_INDIRECT:
+	case H_STUFF_TCE:
+#endif
+	case H_CEDE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
+	case H_SET_MODE:
+#ifdef CONFIG_KVM_XICS
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+#endif
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * List of hcall numbers to enable by default.
+ * For compatibility with old userspace, we enable by default
+ * all hcalls that were implemented before the hcall-enabling
+ * facility was added.  Note this list should not include H_RTAS.
+ */
+static unsigned int default_hcall_list[] = {
+	H_ENTER,
+	H_REMOVE,
+	H_PROTECT,
+	H_BULK_REMOVE,
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	H_GET_TCE,
+	H_PUT_TCE,
+#endif
+	H_CEDE,
+	H_SET_MODE,
+#ifdef CONFIG_KVM_XICS
+	H_XIRR,
+	H_CPPR,
+	H_EOI,
+	H_IPI,
+	H_IPOLL,
+	H_XIRR_X,
+#endif
+	0
+};
+
+void kvmppc_pr_init_default_hcalls(struct kvm *kvm)
+{
+	int i;
+	unsigned int hcall;
+
+	for (i = 0; default_hcall_list[i]; ++i) {
+		hcall = default_hcall_list[i];
+		WARN_ON(!kvmppc_hcall_impl_pr(hcall));
+		__set_bit(hcall / 4, kvm->arch.enabled_hcalls);
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
new file mode 100644
index 0000000000..0a557ffca9
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/exception-64s.h>
+#endif
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in low physical memory          *
+ *                                                                           *
+ ****************************************************************************/
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#ifdef CONFIG_PPC64_ELF_ABI_V2
+#define FUNC(name) 		name
+#else
+#define FUNC(name) 		GLUE(.,name)
+#endif
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define FUNC(name)		name
+
+#define RFI_TO_KERNEL	rfi
+#define RFI_TO_GUEST	rfi
+
+.macro INTERRUPT_TRAMPOLINE intno
+
+.global kvmppc_trampoline_\intno
+kvmppc_trampoline_\intno:
+
+	mtspr	SPRN_SPRG_SCRATCH0, r13		/* Save r13 */
+
+	/*
+	 * First thing to do is to find out if we're coming
+	 * from a KVM guest or a Linux process.
+	 *
+	 * To distinguish, we check a magic byte in the PACA/current
+	 */
+	mfspr	r13, SPRN_SPRG_THREAD
+	lwz	r13, THREAD_KVM_SVCPU(r13)
+	/* PPC32 can have a NULL pointer - let's check for that */
+	mtspr   SPRN_SPRG_SCRATCH1, r12		/* Save r12 */
+	mfcr	r12
+	cmpwi	r13, 0
+	bne	1f
+2:	mtcr	r12
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */
+	b	kvmppc_resume_\intno		/* Get back original handler */
+
+1:	tophys(r13, r13)
+	stw	r12, HSTATE_SCRATCH1(r13)
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	stw	r12, HSTATE_SCRATCH0(r13)
+	lbz	r12, HSTATE_IN_GUEST(r13)
+	cmpwi	r12, KVM_GUEST_MODE_NONE
+	bne	..kvmppc_handler_hasmagic_\intno
+	/* No KVM guest? Then jump back to the Linux handler! */
+	lwz	r12, HSTATE_SCRATCH1(r13)
+	b	2b
+
+	/* Now we know we're handling a KVM guest */
+..kvmppc_handler_hasmagic_\intno:
+
+	/* Should we just skip the faulting instruction? */
+	cmpwi	r12, KVM_GUEST_MODE_SKIP
+	beq	kvmppc_handler_skip_ins
+
+	/* Let's store which interrupt we're handling */
+	li	r12, \intno
+
+	/* Jump into the SLB exit code that goes to the highmem handler */
+	b	kvmppc_handler_trampoline_exit
+
+.endm
+
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_SYSTEM_RESET
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_MACHINE_CHECK
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_STORAGE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_STORAGE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALIGNMENT
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PROGRAM
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_FP_UNAVAIL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DECREMENTER
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_SYSCALL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_TRACE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PERFMON
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
+
+/*
+ * Bring us back to the faulting code, but skip the
+ * faulting instruction.
+ *
+ * This is a generic exit path from the interrupt
+ * trampolines above.
+ *
+ * Input Registers:
+ *
+ * R12            = free
+ * R13            = Shadow VCPU (PACA)
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CR
+ * SPRG_SCRATCH0  = guest R13
+ *
+ */
+kvmppc_handler_skip_ins:
+
+	/* Patch the IP to the next instruction */
+	/* Note that prefixed instructions are disabled in PR KVM for now */
+	mfsrr0	r12
+	addi	r12, r12, 4
+	mtsrr0	r12
+
+	/* Clean up all state */
+	lwz	r12, HSTATE_SCRATCH1(r13)
+	mtcr	r12
+	PPC_LL	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r13)
+
+	/* And get back into the code */
+	RFI_TO_KERNEL
+#endif
+
+/*
+ * Call kvmppc_handler_trampoline_enter in real mode
+ *
+ * On entry, r4 contains the guest shadow MSR
+ * MSR.EE has to be 0 when calling this function
+ */
+_GLOBAL_TOC(kvmppc_entry_trampoline)
+	mfmsr	r5
+	LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
+	toreal(r7)
+
+	li	r6, MSR_IR | MSR_DR
+	andc	r6, r5, r6	/* Clear DR and IR in MSR value */
+	/*
+	 * Set EE in HOST_MSR so that it's enabled when we get into our
+	 * C exit handler function.
+	 */
+	ori	r5, r5, MSR_EE
+	mtsrr0	r7
+	mtsrr1	r6
+	RFI_TO_KERNEL
+
+#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 0000000000..6808bda0db
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <linux/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+#include <asm/xive.h>
+
+#ifdef CONFIG_KVM_XICS
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+	server = be32_to_cpu(args->args[1]);
+	priority = be32_to_cpu(args->args[2]);
+
+	if (xics_on_xive())
+		rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
+	else
+		rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	server = priority = 0;
+	if (xics_on_xive())
+		rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
+	else
+		rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+	args->rets[1] = cpu_to_be32(server);
+	args->rets[2] = cpu_to_be32(priority);
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	if (xics_on_xive())
+		rc = kvmppc_xive_int_off(vcpu->kvm, irq);
+	else
+		rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	if (xics_on_xive())
+		rc = kvmppc_xive_int_on(vcpu->kvm, irq);
+	else
+		rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+#endif /* CONFIG_KVM_XICS */
+
+struct rtas_handler {
+	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+	char *name;
+};
+
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+	{ .name = "ibm,int-off",  .handler = kvm_rtas_int_off },
+	{ .name = "ibm,int-on",   .handler = kvm_rtas_int_on },
+#endif
+};
+
+struct rtas_token_definition {
+	struct list_head list;
+	struct rtas_handler *handler;
+	u64 token;
+};
+
+static int rtas_name_matches(char *s1, char *s2)
+{
+	struct kvm_rtas_token_args args;
+	return !strncmp(s1, s2, sizeof(args.name));
+}
+
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->arch.rtas_token_lock);
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		if (rtas_name_matches(d->handler->name, name)) {
+			list_del(&d->list);
+			kfree(d);
+			return 0;
+		}
+	}
+
+	/* It's not an error to undefine an undefined token */
+	return 0;
+}
+
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+	struct rtas_token_definition *d;
+	struct rtas_handler *h = NULL;
+	bool found;
+	int i;
+
+	lockdep_assert_held(&kvm->arch.rtas_token_lock);
+
+	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+		if (d->token == token)
+			return -EEXIST;
+	}
+
+	found = false;
+	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+		h = &rtas_handlers[i];
+		if (rtas_name_matches(h->name, name)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->handler = h;
+	d->token = token;
+
+	list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+	return 0;
+}
+
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+	struct kvm_rtas_token_args args;
+	int rc;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	mutex_lock(&kvm->arch.rtas_token_lock);
+
+	if (args.token)
+		rc = rtas_token_define(kvm, args.name, args.token);
+	else
+		rc = rtas_token_undefine(kvm, args.name);
+
+	mutex_unlock(&kvm->arch.rtas_token_lock);
+
+	return rc;
+}
+
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+	struct rtas_token_definition *d;
+	struct rtas_args args;
+	rtas_arg_t *orig_rets;
+	gpa_t args_phys;
+	int rc;
+
+	/*
+	 * r4 contains the guest physical address of the RTAS args
+	 * Mask off the top 4 bits since this is a guest real address
+	 */
+	args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
+
+	kvm_vcpu_srcu_read_lock(vcpu);
+	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+	kvm_vcpu_srcu_read_unlock(vcpu);
+	if (rc)
+		goto fail;
+
+	/*
+	 * args->rets is a pointer into args->args. Now that we've
+	 * copied args we need to fix it up to point into our copy,
+	 * not the guest args. We also need to save the original
+	 * value so we can restore it on the way out.
+	 */
+	orig_rets = args.rets;
+	if (be32_to_cpu(args.nargs) >= ARRAY_SIZE(args.args)) {
+		/*
+		 * Don't overflow our args array: ensure there is room for
+		 * at least rets[0] (even if the call specifies 0 nret).
+		 *
+		 * Each handler must then check for the correct nargs and nret
+		 * values, but they may always return failure in rets[0].
+		 */
+		rc = -EINVAL;
+		goto fail;
+	}
+	args.rets = &args.args[be32_to_cpu(args.nargs)];
+
+	mutex_lock(&vcpu->kvm->arch.rtas_token_lock);
+
+	rc = -ENOENT;
+	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+		if (d->token == be32_to_cpu(args.token)) {
+			d->handler->handler(vcpu, &args);
+			rc = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&vcpu->kvm->arch.rtas_token_lock);
+
+	if (rc == 0) {
+		args.rets = orig_rets;
+		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+		if (rc)
+			goto fail;
+	}
+
+	return rc;
+
+fail:
+	/*
+	 * We only get here if the guest has called RTAS with a bogus
+	 * args pointer or nargs/nret values that would overflow the
+	 * array. That means we can't get to the args, and so we can't
+	 * fail the RTAS call. So fail right out to userspace, which
+	 * should kill the guest.
+	 *
+	 * SLOF should actually pass the hcall return value from the
+	 * rtas handler call in r3, so enter_rtas could be modified to
+	 * return a failure indication in r3 and we could return such
+	 * errors to the guest rather than failing to host userspace.
+	 * However old guests that don't test for failure could then
+	 * continue silently after errors, so for now we won't do this.
+	 */
+	return rc;
+}
+EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall);
+
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		list_del(&d->list);
+		kfree(d);
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
new file mode 100644
index 0000000000..202046a83f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -0,0 +1,412 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/* Real mode helpers */
+
+#include <asm/asm-compat.h>
+#include <asm/feature-fixups.h>
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#define GET_SHADOW_VCPU(reg)    \
+	mr	reg, r13
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define GET_SHADOW_VCPU(reg)    			\
+	tophys(reg, r2);       			\
+	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(reg);	\
+	tophys(reg, reg)
+
+#endif
+
+/* Disable for nested KVM */
+#define USE_QUICK_LAST_INST
+
+
+/* Get helper functions for subarch specific functionality */
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+#include "book3s_64_slb.S"
+#elif defined(CONFIG_PPC_BOOK3S_32)
+#include "book3s_32_sr.S"
+#endif
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R4 = guest shadow MSR
+	 * R5 = normal host MSR
+	 * R6 = current host MSR (EE, IR, DR off)
+	 * LR = highmem guest exit code
+	 * all other volatile GPRS = free
+	 * SVCPU[CR] = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR] = guest LR
+	 */
+
+	/* r3 = shadow vcpu */
+	GET_SHADOW_VCPU(r3)
+
+	/* Save guest exit handler address and MSR */
+	mflr	r0
+	PPC_STL	r0, HSTATE_VMHANDLER(r3)
+	PPC_STL	r5, HSTATE_HOST_MSR(r3)
+
+	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
+	PPC_STL	r1, HSTATE_HOST_R1(r3)
+	PPC_STL	r2, HSTATE_HOST_R2(r3)
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r11, KVM_GUEST_MODE_GUEST
+	stb	r11, HSTATE_IN_GUEST(r3)
+
+	/* Switch to guest segment. This is subarch specific. */
+	LOAD_GUEST_SEGMENTS
+
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	/* Save host FSCR */
+	mfspr	r8, SPRN_FSCR
+	std	r8, HSTATE_HOST_FSCR(r13)
+	/* Set FSCR during guest execution */
+	ld	r9, SVCPU_SHADOW_FSCR(r13)
+	mtspr	SPRN_FSCR, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/* Some guests may need to have dcbz set to 32 byte length.
+	 *
+	 * Usually we ensure that by patching the guest's instructions
+	 * to trap on dcbz and emulate it in the hypervisor.
+	 *
+	 * If we can, we should tell the CPU to use 32 byte dcbz though,
+	 * because that's a lot faster.
+	 */
+	lbz	r0, HSTATE_RESTORE_HID5(r3)
+	cmpwi	r0, 0
+	beq	no_dcbz32_on
+
+	mfspr   r0,SPRN_HID5
+	ori     r0, r0, 0x80		/* XXX HID5_dcbz32 = 0x80 */
+	mtspr   SPRN_HID5,r0
+no_dcbz32_on:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/* Enter guest */
+
+	PPC_LL	r8, SVCPU_CTR(r3)
+	PPC_LL	r9, SVCPU_LR(r3)
+	lwz	r10, SVCPU_CR(r3)
+	PPC_LL	r11, SVCPU_XER(r3)
+
+	mtctr	r8
+	mtlr	r9
+	mtcr	r10
+	mtxer	r11
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	PPC_LL  r9, SVCPU_PC(r3)
+	/* First clear RI in our current MSR value */
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+
+	PPC_LL	r0, SVCPU_R0(r3)
+	PPC_LL	r1, SVCPU_R1(r3)
+	PPC_LL	r2, SVCPU_R2(r3)
+	PPC_LL	r5, SVCPU_R5(r3)
+	PPC_LL	r7, SVCPU_R7(r3)
+	PPC_LL	r8, SVCPU_R8(r3)
+	PPC_LL	r10, SVCPU_R10(r3)
+	PPC_LL	r11, SVCPU_R11(r3)
+	PPC_LL	r12, SVCPU_R12(r3)
+	PPC_LL	r13, SVCPU_R13(r3)
+
+	MTMSR_EERI(r6)
+	mtsrr0	r9
+	mtsrr1	r4
+
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
+	PPC_LL	r3, (SVCPU_R3)(r3)
+
+	RFI_TO_GUEST
+kvmppc_handler_trampoline_enter_end:
+
+
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_interrupt_pr
+kvmppc_interrupt_pr:
+	/* 64-bit entry. Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0   = guest R13
+	 * R9              = HSTATE_IN_GUEST
+	 * R12             = (guest CR << 32) | exit handler id
+	 * R13             = PACA
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH2 = guest R9
+	 */
+#ifdef CONFIG_PPC64
+	/* Match 32-bit entry */
+	ld	r9,HSTATE_SCRATCH2(r13)
+	rotldi	r12, r12, 32		  /* Flip R12 halves for stw */
+	stw	r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
+	srdi	r12, r12, 32		  /* shift trap into low half */
+#endif
+
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
+	/* Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = exit handler id
+	 * R13             = shadow vcpu (32-bit) or PACA (64-bit)
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CR
+	 */
+
+	/* Save registers */
+
+	PPC_STL	r0, SVCPU_R0(r13)
+	PPC_STL	r1, SVCPU_R1(r13)
+	PPC_STL	r2, SVCPU_R2(r13)
+	PPC_STL	r3, SVCPU_R3(r13)
+	PPC_STL	r4, SVCPU_R4(r13)
+	PPC_STL	r5, SVCPU_R5(r13)
+	PPC_STL	r6, SVCPU_R6(r13)
+	PPC_STL	r7, SVCPU_R7(r13)
+	PPC_STL	r8, SVCPU_R8(r13)
+	PPC_STL	r9, SVCPU_R9(r13)
+	PPC_STL	r10, SVCPU_R10(r13)
+	PPC_STL	r11, SVCPU_R11(r13)
+
+	/* Restore R1/R2 so we can handle faults */
+	PPC_LL	r1, HSTATE_HOST_R1(r13)
+	PPC_LL	r2, HSTATE_HOST_R2(r13)
+
+	/* Save guest PC and MSR */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	andi.	r0, r12, 0x2
+	cmpwi	cr1, r0, 0
+	beq	1f
+	mfspr	r3,SPRN_HSRR0
+	mfspr	r4,SPRN_HSRR1
+	andi.	r12,r12,0x3ffd
+	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
+1:	mfsrr0	r3
+	mfsrr1	r4
+2:
+	PPC_STL	r3, SVCPU_PC(r13)
+	PPC_STL	r4, SVCPU_SHADOW_SRR1(r13)
+
+	/* Get scratch'ed off registers */
+	GET_SCRATCH0(r9)
+	PPC_LL	r8, HSTATE_SCRATCH0(r13)
+	lwz	r7, HSTATE_SCRATCH1(r13)
+
+	PPC_STL	r9, SVCPU_R13(r13)
+	PPC_STL	r8, SVCPU_R12(r13)
+	stw	r7, SVCPU_CR(r13)
+
+	/* Save more register state  */
+
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+	mflr	r9
+
+	PPC_STL	r5, SVCPU_XER(r13)
+	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
+	stw	r7, SVCPU_FAULT_DSISR(r13)
+	PPC_STL	r8, SVCPU_CTR(r13)
+	PPC_STL	r9, SVCPU_LR(r13)
+
+	/*
+	 * In order for us to easily get the last instruction,
+	 * we got the #vmexit at, we exploit the fact that the
+	 * virtual layout is still the same here, so we can just
+	 * ld from the guest's PC address
+	 */
+
+	/* We only load the last instruction when it's safe */
+	cmpwi	r12, BOOK3S_INTERRUPT_DATA_STORAGE
+	beq	ld_last_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_PROGRAM
+	beq	ld_last_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_SYSCALL
+	beq	ld_last_prev_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_ALIGNMENT
+	beq-	ld_last_inst
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	cmpwi	r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	beq-	ld_last_inst
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+BEGIN_FTR_SECTION
+	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+	beq-	ld_last_inst
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
+
+	b	no_ld_last_inst
+
+ld_last_prev_inst:
+	addi	r3, r3, -4
+
+ld_last_inst:
+	/* Save off the guest instruction we're at */
+
+	/* In case lwz faults */
+	li	r0, KVM_INST_FETCH_FAILED
+
+#ifdef USE_QUICK_LAST_INST
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r9, KVM_GUEST_MODE_SKIP
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/*    1) enable paging for data */
+	mfmsr	r9
+	ori	r11, r9, MSR_DR			/* Enable paging for data */
+	mtmsr	r11
+	sync
+	/*    2) fetch the instruction */
+	lwz	r0, 0(r3)
+	/*    3) disable paging again */
+	mtmsr	r9
+	sync
+
+#endif
+	stw	r0, SVCPU_LAST_INST(r13)
+
+no_ld_last_inst:
+
+	/* Unset guest mode */
+	li	r9, KVM_GUEST_MODE_NONE
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/* Switch back to host MMU */
+	LOAD_HOST_SEGMENTS
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+	lbz	r5, HSTATE_RESTORE_HID5(r13)
+	cmpwi	r5, 0
+	beq	no_dcbz32_off
+
+	li	r4, 0
+	mfspr   r5,SPRN_HID5
+	rldimi  r5,r4,6,56
+	mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+BEGIN_FTR_SECTION
+	/* Save guest FSCR on a FAC_UNAVAIL interrupt */
+	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+	bne+	no_fscr_save
+	mfspr	r7, SPRN_FSCR
+	std	r7, SVCPU_SHADOW_FSCR(r13)
+no_fscr_save:
+	/* Restore host FSCR */
+	ld	r8, HSTATE_HOST_FSCR(r13)
+	mtspr	SPRN_FSCR, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Having set up SRR0/1 with the address where we want
+	 * to continue with relocation on (potentially in module
+	 * space), we either just go straight there with rfi[d],
+	 * or we jump to an interrupt handler if there is an
+	 * interrupt to be handled first.  In the latter case,
+	 * the rfi[d] at the end of the interrupt handler will
+	 * get us back to where we want to continue.
+	 */
+
+	/* Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R10      = raw exit handler id
+	 * R12      = exit handler id
+	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)
+	 * SVCPU.*  = guest *
+	 *
+	 */
+
+	PPC_LL	r6, HSTATE_HOST_MSR(r13)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/*
+	 * We don't want to change MSR[TS] bits via rfi here.
+	 * The actual TM handling logic will be in host with
+	 * recovered DR/IR bits after HSTATE_VMHANDLER.
+	 * And MSR_TM can be enabled in HOST_MSR so rfid may
+	 * not suppress this change and can lead to exception.
+	 * Manually set MSR to prevent TS state change here.
+	 */
+	mfmsr   r7
+	rldicl  r7, r7, 64 - MSR_TS_S_LG, 62
+	rldimi  r6, r7, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+#endif
+	PPC_LL	r8, HSTATE_VMHANDLER(r13)
+
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	beq	cr1, 1f
+	mtspr	SPRN_HSRR1, r6
+	mtspr	SPRN_HSRR0, r8
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
+1:	/* Restore host msr -> SRR1 */
+	mtsrr1	r6
+	/* Load highmem handler address */
+	mtsrr0	r8
+
+	/* RFI into the highmem handler, or jump to interrupt handler */
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beqa	BOOK3S_INTERRUPT_EXTERNAL
+	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
+	beqa	BOOK3S_INTERRUPT_DECREMENTER
+	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
+	beqa	BOOK3S_INTERRUPT_PERFMON
+	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
+	beqa	BOOK3S_INTERRUPT_DOORBELL
+
+	RFI_TO_KERNEL
+kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 0000000000..589a8f2571
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1507 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/anon_inodes.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/time.h>
+
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+#define ENABLE_REALMODE	true
+#define DEBUG_REALMODE	false
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a spin lock protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq, bool check_resend);
+
+/*
+ * Return value ideally indicates how the interrupt was handled, but no
+ * callers look at it (given that we don't implement KVM_IRQ_LINE_STATUS),
+ * so just return 0.
+ */
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u16 src;
+	u32 pq_old, pq_new;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return -EINVAL;
+	}
+	state = &ics->irq_state[src];
+	if (!state->exists)
+		return -EINVAL;
+
+	if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
+		level = 1;
+	else if (level == KVM_INTERRUPT_UNSET)
+		level = 0;
+	/*
+	 * Take other values the same as 1, consistent with original code.
+	 * maybe WARN here?
+	 */
+
+	if (!state->lsi && level == 0) /* noop for MSI */
+		return 0;
+
+	do {
+		pq_old = state->pq_state;
+		if (state->lsi) {
+			if (level) {
+				if (pq_old & PQ_PRESENTED)
+					/* Setting already set LSI ... */
+					return 0;
+
+				pq_new = PQ_PRESENTED;
+			} else
+				pq_new = 0;
+		} else
+			pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+	} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	/* Test P=1, Q=0, this is the only case where we present */
+	if (pq_new == PQ_PRESENTED)
+		icp_deliver_irq(xics, NULL, irq, false);
+
+	/* Record which CPU this arrived on for passed-through interrupts */
+	if (state->host_irq)
+		state->intr_cpu = raw_smp_processor_id();
+
+	return 0;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+		if (state->resend) {
+			XICS_DBG("resend %#x prio %#x\n", state->number,
+				      state->priority);
+			icp_deliver_irq(xics, icp, state->number, true);
+		}
+	}
+}
+
+static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+		       struct ics_irq_state *state,
+		       u32 server, u32 priority, u32 saved_priority)
+{
+	bool deliver;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+
+	state->server = server;
+	state->priority = priority;
+	state->saved_priority = saved_priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		state->resend = 0;
+		deliver = true;
+	}
+
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	return deliver;
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	if (write_xive(xics, ics, state, server, priority, priority))
+		icp_deliver_irq(xics, icp, irq, false);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	unsigned long flags;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, state->server);
+	if (!icp)
+		return -EINVAL;
+
+	if (write_xive(xics, ics, state, state->server, state->saved_priority,
+		       state->saved_priority))
+		icp_deliver_irq(xics, icp, irq, false);
+
+	return 0;
+}
+
+int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	write_xive(xics, ics, state, state->server, MASKED, state->priority);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04lx] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->server_num,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL);
+		if (!change_self)
+			kvmppc_fast_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#lx\n", irq, priority,
+		 icp->server_num);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq, bool check_resend)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+	unsigned long flags;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+				new_irq, state->server);
+			goto out;
+		}
+	}
+
+	if (check_resend)
+		if (!state->resend)
+			goto out;
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
+			new_irq = reject;
+			check_resend = false;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		state->resend = 1;
+
+		/*
+		 * Make sure when checking resend, we don't miss the resend
+		 * if resend_map bit is seen and cleared.
+		 */
+		smp_wmb();
+		set_bit(ics->icsid, icp->resend_map);
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			state->resend = 0;
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
+			check_resend = false;
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+				 unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+		 vcpu->vcpu_id, server, mfrr);
+
+	icp = vcpu->arch.icp;
+	local = icp->server_num == server;
+	if (!local) {
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+		if (!icp)
+			return H_PARAMETER;
+	}
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * ICP state: Check_IPI
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it.
+	 *
+	 * ICP State: IPI
+	 *
+	 * Besides rejecting any pending interrupts, we also
+	 * update XISR and pending_pri to mark IPI as pending.
+	 *
+	 * PAPR does not describe this state, but if the MFRR is being
+	 * made less favored than its earlier value, there might be
+	 * a previously-rejected interrupt needing to be resent.
+	 * Ideally, we would want to resend only if
+	 *	prio(pending_interrupt) < mfrr &&
+	 *	prio(pending_interrupt) < cppr
+	 * where pending interrupt is the one that was rejected. But
+	 * we don't have that state, so we simply trigger a resend
+	 * whenever the MFRR is made less favored.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject, false);
+
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
+{
+	union kvmppc_icp_state state;
+	struct kvmppc_icp *icp;
+
+	icp = vcpu->arch.icp;
+	if (icp->server_num != server) {
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+		if (!icp)
+			return H_PARAMETER;
+	}
+	state = READ_ONCE(icp->state);
+	kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr);
+	kvmppc_set_gpr(vcpu, 5, state.mfrr);
+	return H_SUCCESS;
+}
+
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject, false);
+}
+
+static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	u32 pq_old, pq_new;
+
+	/*
+	 * ICS EOI handling: For LSI, if P bit is still set, we need to
+	 * resend it.
+	 *
+	 * For MSI, we move Q bit into P (and clear Q). If it is set,
+	 * resend it.
+	 */
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	if (state->lsi)
+		pq_new = state->pq_state;
+	else
+		do {
+			pq_old = state->pq_state;
+			pq_new = pq_old >> 1;
+		} while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+	if (pq_new & PQ_PRESENTED)
+		icp_deliver_irq(xics, icp, irq, false);
+
+	kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+	return H_SUCCESS;
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 irq = xirr & 0x00ffffff;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+
+	return ics_eoi(vcpu, irq);
+}
+
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+
+	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+	if (icp->rm_action & XICS_RM_KICK_VCPU) {
+		icp->n_rm_kick_vcpu++;
+		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+	}
+	if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+		icp->n_rm_check_resend++;
+		icp_check_resend(xics, icp->rm_resend_icp);
+	}
+	if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+		icp->n_rm_notify_eoi++;
+		kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+	}
+
+	icp->rm_action = 0;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	unsigned long res;
+	int rc = H_SUCCESS;
+
+	/* Check if we have an ICP */
+	if (!xics || !vcpu->arch.icp)
+		return H_HARDWARE;
+
+	/* These requests don't have real-mode implementations at present */
+	switch (req) {
+	case H_XIRR_X:
+		res = kvmppc_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		kvmppc_set_gpr(vcpu, 5, get_tb());
+		return rc;
+	case H_IPOLL:
+		rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
+		return rc;
+	}
+
+	/* Check for real mode returning too hard */
+	if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm))
+		return kvmppc_xics_rm_complete(vcpu, req);
+
+	switch (req) {
+	case H_XIRR:
+		res = kvmppc_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		break;
+	case H_CPPR:
+		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_EOI:
+		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_IPI:
+		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+				  kvmppc_get_gpr(vcpu, 5));
+		break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
+
+
+/* -- Initialisation code etc. -- */
+
+static void xics_debugfs_irqmap(struct seq_file *m,
+				struct kvmppc_passthru_irqmap *pimap)
+{
+	int i;
+
+	if (!pimap)
+		return;
+	seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n",
+				pimap->n_mapped);
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+			pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+	}
+}
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int icsid;
+	unsigned long flags, i;
+	unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+	unsigned long t_rm_notify_eoi;
+	unsigned long t_reject, t_check_resend;
+
+	if (!kvm)
+		return 0;
+
+	t_rm_kick_vcpu = 0;
+	t_rm_notify_eoi = 0;
+	t_rm_check_resend = 0;
+	t_check_resend = 0;
+	t_reject = 0;
+
+	xics_debugfs_irqmap(m, kvm->arch.pimap);
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
+
+		if (!icp)
+			continue;
+
+		state.raw = READ_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+			   icp->server_num, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
+		t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+		t_rm_notify_eoi += icp->n_rm_notify_eoi;
+		t_rm_check_resend += icp->n_rm_check_resend;
+		t_check_resend += icp->n_check_resend;
+		t_reject += icp->n_reject;
+	}
+
+	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
+			t_rm_kick_vcpu, t_rm_check_resend,
+			t_rm_notify_eoi);
+	seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
+			t_check_resend, t_reject);
+	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+			   icsid);
+
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
+
+		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+			struct ics_irq_state *irq = &ics->irq_state[i];
+
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->pq_state,
+				   irq->resend, irq->masked_pending);
+
+		}
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(xics_debug);
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	xics->dentry = debugfs_create_file("xics", 0444, xics->kvm->debugfs_dentry,
+					   xics, &xics_debug_fops);
+
+	pr_debug("%s: created\n", __func__);
+}
+
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+					struct kvmppc_xics *xics, int irq)
+{
+	struct kvmppc_ics *ics;
+	int i, icsid;
+
+	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&kvm->lock);
+
+	/* ICS already exists - somebody else got here first */
+	if (xics->ics[icsid])
+		goto out;
+
+	/* Create the ICS */
+	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+	if (!ics)
+		goto out;
+
+	ics->icsid = icsid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+		ics->irq_state[i].priority = MASKED;
+		ics->irq_state[i].saved_priority = MASKED;
+	}
+	smp_wmb();
+	xics->ics[icsid] = ics;
+
+	if (icsid > xics->max_icsid)
+		xics->max_icsid = icsid;
+
+ out:
+	mutex_unlock(&kvm->lock);
+	return xics->ics[icsid];
+}
+
+static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+	struct kvmppc_icp *icp;
+
+	if (!vcpu->kvm->arch.xics)
+		return -ENODEV;
+
+	if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+		return -EEXIST;
+
+	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+	if (!icp)
+		return -ENOMEM;
+
+	icp->vcpu = vcpu;
+	icp->server_num = server_num;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
+	vcpu->arch.icp = icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	union kvmppc_icp_state state;
+
+	if (!icp)
+		return 0;
+	state = icp->state;
+	return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+		((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+		((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+		((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+}
+
+int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_ics *ics;
+	u8 cppr, mfrr, pending_pri;
+	u32 xisr;
+	u16 src;
+	bool resend;
+
+	if (!icp || !xics)
+		return -ENOENT;
+
+	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+		KVM_REG_PPC_ICP_XISR_MASK;
+	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+	pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
+
+	/* Require the new state to be internally consistent */
+	if (xisr == 0) {
+		if (pending_pri != 0xff)
+			return -EINVAL;
+	} else if (xisr == XICS_IPI) {
+		if (pending_pri != mfrr || pending_pri >= cppr)
+			return -EINVAL;
+	} else {
+		if (pending_pri >= mfrr || pending_pri >= cppr)
+			return -EINVAL;
+		ics = kvmppc_xics_find_ics(xics, xisr, &src);
+		if (!ics)
+			return -EINVAL;
+	}
+
+	new_state.raw = 0;
+	new_state.cppr = cppr;
+	new_state.xisr = xisr;
+	new_state.mfrr = mfrr;
+	new_state.pending_pri = pending_pri;
+
+	/*
+	 * Deassert the CPU interrupt request.
+	 * icp_try_update will reassert it if necessary.
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+
+	/*
+	 * Note that if we displace an interrupt from old_state.xisr,
+	 * we don't mark it as rejected.  We expect userspace to set
+	 * the state of the interrupt sources to be consistent with
+	 * the ICP states (either before or afterwards, which doesn't
+	 * matter).  We do handle resends due to CPPR becoming less
+	 * favoured because that is necessary to end up with a
+	 * consistent state in the situation where userspace restores
+	 * the ICS states before the ICP states.
+	 */
+	do {
+		old_state = READ_ONCE(icp->state);
+
+		if (new_state.mfrr <= old_state.mfrr) {
+			resend = false;
+			new_state.need_resend = old_state.need_resend;
+		} else {
+			resend = old_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return 0;
+}
+
+static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	int ret;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val, prio;
+	unsigned long flags;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return -ENOENT;
+
+	irqp = &ics->irq_state[idx];
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	ret = -ENOENT;
+	if (irqp->exists) {
+		val = irqp->server;
+		prio = irqp->priority;
+		if (prio == MASKED) {
+			val |= KVM_XICS_MASKED;
+			prio = irqp->saved_priority;
+		}
+		val |= prio << KVM_XICS_PRIORITY_SHIFT;
+		if (irqp->lsi) {
+			val |= KVM_XICS_LEVEL_SENSITIVE;
+			if (irqp->pq_state & PQ_PRESENTED)
+				val |= KVM_XICS_PENDING;
+		} else if (irqp->masked_pending || irqp->resend)
+			val |= KVM_XICS_PENDING;
+
+		if (irqp->pq_state & PQ_PRESENTED)
+			val |= KVM_XICS_PRESENTED;
+
+		if (irqp->pq_state & PQ_QUEUED)
+			val |= KVM_XICS_QUEUED;
+
+		ret = 0;
+	}
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	if (!ret && put_user(val, ubufp))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 prio;
+	u32 server;
+	unsigned long flags;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics) {
+		ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
+		if (!ics)
+			return -ENOMEM;
+	}
+	irqp = &ics->irq_state[idx];
+	if (get_user(val, ubufp))
+		return -EFAULT;
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	prio = val >> KVM_XICS_PRIORITY_SHIFT;
+	if (prio != MASKED &&
+	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	irqp->server = server;
+	irqp->saved_priority = prio;
+	if (val & KVM_XICS_MASKED)
+		prio = MASKED;
+	irqp->priority = prio;
+	irqp->resend = 0;
+	irqp->masked_pending = 0;
+	irqp->lsi = 0;
+	irqp->pq_state = 0;
+	if (val & KVM_XICS_LEVEL_SENSITIVE)
+		irqp->lsi = 1;
+	/* If PENDING, set P in case P is not saved because of old code */
+	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+		irqp->pq_state |= PQ_PRESENTED;
+	if (val & KVM_XICS_QUEUED)
+		irqp->pq_state |= PQ_QUEUED;
+	irqp->exists = 1;
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	if (val & KVM_XICS_PENDING)
+		icp_deliver_irq(xics, NULL, irqp->number, false);
+
+	return 0;
+}
+
+int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+			bool line_status)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+
+	if (!xics)
+		return -ENODEV;
+	return ics_deliver_irq(xics, irq, level);
+}
+
+static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_set_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_get_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	}
+	return -ENXIO;
+}
+
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xics_release(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = dev->private;
+	unsigned long i;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+
+	pr_devel("Releasing xics device\n");
+
+	/*
+	 * Since this is the device release function, we know that
+	 * userspace does not have any open fd referring to the
+	 * device.  Therefore there can not be any of the device
+	 * attribute set/get functions being executed concurrently,
+	 * and similarly, the connect_vcpu and set/clr_mapped
+	 * functions also cannot be being executed.
+	 */
+
+	debugfs_remove(xics->dentry);
+
+	/*
+	 * We should clean up the vCPU interrupt presenters first.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/*
+		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+		 * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently.
+		 * Holding the vcpu->mutex also means that execution is
+		 * excluded for the vcpu until the ICP was freed. When the vcpu
+		 * can execute again, vcpu->arch.icp and vcpu->arch.irq_type
+		 * have been cleared and the vcpu will not be going into the
+		 * XICS code anymore.
+		 */
+		mutex_lock(&vcpu->mutex);
+		kvmppc_xics_free_icp(vcpu);
+		mutex_unlock(&vcpu->mutex);
+	}
+
+	if (kvm)
+		kvm->arch.xics = NULL;
+
+	for (i = 0; i <= xics->max_icsid; i++) {
+		kfree(xics->ics[i]);
+		xics->ics[i] = NULL;
+	}
+	/*
+	 * A reference of the kvmppc_xics pointer is now kept under
+	 * the xics_device pointer of the machine for reuse. It is
+	 * freed when the VM is destroyed for now until we fix all the
+	 * execution paths.
+	 */
+	kfree(dev);
+}
+
+static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm)
+{
+	struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device;
+	struct kvmppc_xics *xics = *kvm_xics_device;
+
+	if (!xics) {
+		xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+		*kvm_xics_device = xics;
+	} else {
+		memset(xics, 0, sizeof(*xics));
+	}
+
+	return xics;
+}
+
+static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xics *xics;
+	struct kvm *kvm = dev->kvm;
+
+	pr_devel("Creating xics for partition\n");
+
+	/* Already there ? */
+	if (kvm->arch.xics)
+		return -EEXIST;
+
+	xics = kvmppc_xics_get_device(kvm);
+	if (!xics)
+		return -ENOMEM;
+
+	dev->private = xics;
+	xics->dev = dev;
+	xics->kvm = kvm;
+	kvm->arch.xics = xics;
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+	    cpu_has_feature(CPU_FTR_HVMODE)) {
+		/* Enable real mode support */
+		xics->real_mode = ENABLE_REALMODE;
+		xics->real_mode_dbg = DEBUG_REALMODE;
+	}
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+	return 0;
+}
+
+static void kvmppc_xics_init(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	xics_debugfs_init(xics);
+}
+
+struct kvm_device_ops kvm_xics_ops = {
+	.name = "kvm-xics",
+	.create = kvmppc_xics_create,
+	.init = kvmppc_xics_init,
+	.release = kvmppc_xics_release,
+	.set_attr = xics_set_attr,
+	.get_attr = xics_get_attr,
+	.has_attr = xics_has_attr,
+};
+
+int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 xcpu)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int r = -EBUSY;
+
+	if (dev->ops != &kvm_xics_ops)
+		return -EPERM;
+	if (xics->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
+		return -EBUSY;
+
+	r = kvmppc_xics_create_icp(vcpu, xcpu);
+	if (!r)
+		vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+
+	return r;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.icp)
+		return;
+	kfree(vcpu->arch.icp);
+	vcpu->arch.icp = NULL;
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = host_irq;
+	ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 0000000000..08fb0843fa
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+#ifdef CONFIG_KVM_XICS
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID	1023
+#define KVMPPC_XICS_ICS_SHIFT	10
+#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ	16
+#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+				 KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED	0xff
+
+#define PQ_PRESENTED	1
+#define PQ_QUEUED	2
+
+/* State for one irq source */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u32 pq_state;
+	u8  priority;
+	u8  saved_priority;
+	u8  resend;
+	u8  masked_pending;
+	u8  lsi;		/* level-sensitive interrupt */
+	u8  exists;
+	int intr_cpu;
+	u32 host_irq;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee:1;
+		u8 need_resend:1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	unsigned long server_num;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+	/* Real mode might find something too hard, here's the action
+	 * it might request from virtual mode
+	 */
+#define XICS_RM_KICK_VCPU	0x1
+#define XICS_RM_CHECK_RESEND	0x2
+#define XICS_RM_NOTIFY_EOI	0x8
+	u32 rm_action;
+	struct kvm_vcpu *rm_kick_target;
+	struct kvmppc_icp *rm_resend_icp;
+	u32  rm_reject;
+	u32  rm_eoied_irq;
+
+	/* Counters for each reason we exited real mode */
+	unsigned long n_rm_kick_vcpu;
+	unsigned long n_rm_check_resend;
+	unsigned long n_rm_notify_eoi;
+	/* Counters for handling ICP processing in real mode */
+	unsigned long n_check_resend;
+	unsigned long n_reject;
+
+	/* Debug stuff for real mode */
+	union kvmppc_icp_state rm_dbgstate;
+	struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+	arch_spinlock_t lock;
+	u16 icsid;
+	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct dentry *dentry;
+	u32 max_icsid;
+	bool real_mode;
+	bool real_mode_dbg;
+	u32 err_noics;
+	u32 err_noicp;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (source)
+		*source = src;
+	if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	ics = xics->ics[icsid];
+	if (!ics)
+		return NULL;
+	return ics;
+}
+
+extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu);
+extern unsigned long xics_rm_h_xirr_x(struct kvm_vcpu *vcpu);
+extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			 unsigned long mfrr);
+extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+#endif /* CONFIG_KVM_XICS */
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
new file mode 100644
index 0000000000..f4115819e7
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -0,0 +1,2982 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "xive-kvm: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/uaccess.h>
+#include <linux/irqdomain.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/debug.h>
+#include <asm/time.h>
+#include <asm/opal.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xive.h"
+
+#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
+#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
+
+/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
+#define XICS_DUMMY	1
+
+static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc)
+{
+	u8 cppr;
+	u16 ack;
+
+	/*
+	 * Ensure any previous store to CPPR is ordered vs.
+	 * the subsequent loads from PIPR or ACK.
+	 */
+	eieio();
+
+	/* Perform the acknowledge OS to register cycle. */
+	ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG));
+
+	/* Synchronize subsequent queue accesses */
+	mb();
+
+	/* XXX Check grouping level */
+
+	/* Anything ? */
+	if (!((ack >> 8) & TM_QW1_NSR_EO))
+		return;
+
+	/* Grab CPPR of the most favored pending interrupt */
+	cppr = ack & 0xff;
+	if (cppr < 8)
+		xc->pending |= 1 << cppr;
+
+	/* Check consistency */
+	if (cppr >= xc->hw_cppr)
+		pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
+			smp_processor_id(), cppr, xc->hw_cppr);
+
+	/*
+	 * Update our image of the HW CPPR. We don't yet modify
+	 * xc->cppr, this will be done as we scan for interrupts
+	 * in the queues.
+	 */
+	xc->hw_cppr = cppr;
+}
+
+static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+		offset |= XIVE_ESB_LD_ST_MO;
+
+	val = __raw_readq(__x_eoi_page(xd) + offset);
+#ifdef __LITTLE_ENDIAN__
+	val >>= 64-8;
+#endif
+	return (u8)val;
+}
+
+
+static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
+{
+	/* If the XIVE supports the new "store EOI facility, use it */
+	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+		__raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
+	else if (xd->flags & XIVE_IRQ_FLAG_LSI) {
+		/*
+		 * For LSIs the HW EOI cycle is used rather than PQ bits,
+		 * as they are automatically re-triggred in HW when still
+		 * pending.
+		 */
+		__raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
+	} else {
+		uint64_t eoi_val;
+
+		/*
+		 * Otherwise for EOI, we use the special MMIO that does
+		 * a clear of both P and Q and returns the old Q,
+		 * except for LSIs where we use the "EOI cycle" special
+		 * load.
+		 *
+		 * This allows us to then do a re-trigger if Q was set
+		 * rather than synthetizing an interrupt in software
+		 */
+		eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00);
+
+		/* Re-trigger if needed */
+		if ((eoi_val & 1) && __x_trig_page(xd))
+			__raw_writeq(0, __x_trig_page(xd));
+	}
+}
+
+enum {
+	scan_fetch,
+	scan_poll,
+	scan_eoi,
+};
+
+static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc,
+				       u8 pending, int scan_type)
+{
+	u32 hirq = 0;
+	u8 prio = 0xff;
+
+	/* Find highest pending priority */
+	while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
+		struct xive_q *q;
+		u32 idx, toggle;
+		__be32 *qpage;
+
+		/*
+		 * If pending is 0 this will return 0xff which is what
+		 * we want
+		 */
+		prio = ffs(pending) - 1;
+
+		/* Don't scan past the guest cppr */
+		if (prio >= xc->cppr || prio > 7) {
+			if (xc->mfrr < xc->cppr) {
+				prio = xc->mfrr;
+				hirq = XICS_IPI;
+			}
+			break;
+		}
+
+		/* Grab queue and pointers */
+		q = &xc->queues[prio];
+		idx = q->idx;
+		toggle = q->toggle;
+
+		/*
+		 * Snapshot the queue page. The test further down for EOI
+		 * must use the same "copy" that was used by __xive_read_eq
+		 * since qpage can be set concurrently and we don't want
+		 * to miss an EOI.
+		 */
+		qpage = READ_ONCE(q->qpage);
+
+skip_ipi:
+		/*
+		 * Try to fetch from the queue. Will return 0 for a
+		 * non-queueing priority (ie, qpage = 0).
+		 */
+		hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
+
+		/*
+		 * If this was a signal for an MFFR change done by
+		 * H_IPI we skip it. Additionally, if we were fetching
+		 * we EOI it now, thus re-enabling reception of a new
+		 * such signal.
+		 *
+		 * We also need to do that if prio is 0 and we had no
+		 * page for the queue. In this case, we have non-queued
+		 * IPI that needs to be EOId.
+		 *
+		 * This is safe because if we have another pending MFRR
+		 * change that wasn't observed above, the Q bit will have
+		 * been set and another occurrence of the IPI will trigger.
+		 */
+		if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
+			if (scan_type == scan_fetch) {
+				xive_vm_source_eoi(xc->vp_ipi,
+						       &xc->vp_ipi_data);
+				q->idx = idx;
+				q->toggle = toggle;
+			}
+			/* Loop back on same queue with updated idx/toggle */
+			WARN_ON(hirq && hirq != XICS_IPI);
+			if (hirq)
+				goto skip_ipi;
+		}
+
+		/* If it's the dummy interrupt, continue searching */
+		if (hirq == XICS_DUMMY)
+			goto skip_ipi;
+
+		/* Clear the pending bit if the queue is now empty */
+		if (!hirq) {
+			pending &= ~(1 << prio);
+
+			/*
+			 * Check if the queue count needs adjusting due to
+			 * interrupts being moved away.
+			 */
+			if (atomic_read(&q->pending_count)) {
+				int p = atomic_xchg(&q->pending_count, 0);
+
+				if (p) {
+					WARN_ON(p > atomic_read(&q->count));
+					atomic_sub(p, &q->count);
+				}
+			}
+		}
+
+		/*
+		 * If the most favoured prio we found pending is less
+		 * favored (or equal) than a pending IPI, we return
+		 * the IPI instead.
+		 */
+		if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
+			prio = xc->mfrr;
+			hirq = XICS_IPI;
+			break;
+		}
+
+		/* If fetching, update queue pointers */
+		if (scan_type == scan_fetch) {
+			q->idx = idx;
+			q->toggle = toggle;
+		}
+	}
+
+	/* If we are just taking a "peek", do nothing else */
+	if (scan_type == scan_poll)
+		return hirq;
+
+	/* Update the pending bits */
+	xc->pending = pending;
+
+	/*
+	 * If this is an EOI that's it, no CPPR adjustment done here,
+	 * all we needed was cleanup the stale pending bits and check
+	 * if there's anything left.
+	 */
+	if (scan_type == scan_eoi)
+		return hirq;
+
+	/*
+	 * If we found an interrupt, adjust what the guest CPPR should
+	 * be as if we had just fetched that interrupt from HW.
+	 *
+	 * Note: This can only make xc->cppr smaller as the previous
+	 * loop will only exit with hirq != 0 if prio is lower than
+	 * the current xc->cppr. Thus we don't need to re-check xc->mfrr
+	 * for pending IPIs.
+	 */
+	if (hirq)
+		xc->cppr = prio;
+	/*
+	 * If it was an IPI the HW CPPR might have been lowered too much
+	 * as the HW interrupt we use for IPIs is routed to priority 0.
+	 *
+	 * We re-sync it here.
+	 */
+	if (xc->cppr != xc->hw_cppr) {
+		xc->hw_cppr = xc->cppr;
+		__raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+	}
+
+	return hirq;
+}
+
+static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u8 old_cppr;
+	u32 hirq;
+
+	pr_devel("H_XIRR\n");
+
+	xc->stat_vm_h_xirr++;
+
+	/* First collect pending bits from HW */
+	xive_vm_ack_pending(xc);
+
+	pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
+		 xc->pending, xc->hw_cppr, xc->cppr);
+
+	/* Grab previous CPPR and reverse map it */
+	old_cppr = xive_prio_to_guest(xc->cppr);
+
+	/* Scan for actual interrupts */
+	hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch);
+
+	pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
+		 hirq, xc->hw_cppr, xc->cppr);
+
+	/* That should never hit */
+	if (hirq & 0xff000000)
+		pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
+
+	/*
+	 * XXX We could check if the interrupt is masked here and
+	 * filter it. If we chose to do so, we would need to do:
+	 *
+	 *    if (masked) {
+	 *        lock();
+	 *        if (masked) {
+	 *            old_Q = true;
+	 *            hirq = 0;
+	 *        }
+	 *        unlock();
+	 *    }
+	 */
+
+	/* Return interrupt and old CPPR in GPR4 */
+	vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24);
+
+	return H_SUCCESS;
+}
+
+static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u8 pending = xc->pending;
+	u32 hirq;
+
+	pr_devel("H_IPOLL(server=%ld)\n", server);
+
+	xc->stat_vm_h_ipoll++;
+
+	/* Grab the target VCPU if not the current one */
+	if (xc->server_num != server) {
+		vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+		if (!vcpu)
+			return H_PARAMETER;
+		xc = vcpu->arch.xive_vcpu;
+
+		/* Scan all priorities */
+		pending = 0xff;
+	} else {
+		/* Grab pending interrupt if any */
+		__be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS);
+		u8 pipr = be64_to_cpu(qw1) & 0xff;
+
+		if (pipr < 8)
+			pending |= 1 << pipr;
+	}
+
+	hirq = xive_vm_scan_interrupts(xc, pending, scan_poll);
+
+	/* Return interrupt and old CPPR in GPR4 */
+	vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24);
+
+	return H_SUCCESS;
+}
+
+static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc)
+{
+	u8 pending, prio;
+
+	pending = xc->pending;
+	if (xc->mfrr != 0xff) {
+		if (xc->mfrr < 8)
+			pending |= 1 << xc->mfrr;
+		else
+			pending |= 0x80;
+	}
+	if (!pending)
+		return;
+	prio = ffs(pending) - 1;
+
+	__raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING);
+}
+
+static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive,
+					       struct kvmppc_xive_vcpu *xc)
+{
+	unsigned int prio;
+
+	/* For each priority that is now masked */
+	for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+		struct xive_q *q = &xc->queues[prio];
+		struct kvmppc_xive_irq_state *state;
+		struct kvmppc_xive_src_block *sb;
+		u32 idx, toggle, entry, irq, hw_num;
+		struct xive_irq_data *xd;
+		__be32 *qpage;
+		u16 src;
+
+		idx = q->idx;
+		toggle = q->toggle;
+		qpage = READ_ONCE(q->qpage);
+		if (!qpage)
+			continue;
+
+		/* For each interrupt in the queue */
+		for (;;) {
+			entry = be32_to_cpup(qpage + idx);
+
+			/* No more ? */
+			if ((entry >> 31) == toggle)
+				break;
+			irq = entry & 0x7fffffff;
+
+			/* Skip dummies and IPIs */
+			if (irq == XICS_DUMMY || irq == XICS_IPI)
+				goto next;
+			sb = kvmppc_xive_find_source(xive, irq, &src);
+			if (!sb)
+				goto next;
+			state = &sb->irq_state[src];
+
+			/* Has it been rerouted ? */
+			if (xc->server_num == state->act_server)
+				goto next;
+
+			/*
+			 * Allright, it *has* been re-routed, kill it from
+			 * the queue.
+			 */
+			qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
+
+			/* Find the HW interrupt */
+			kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+			/* If it's not an LSI, set PQ to 11 the EOI will force a resend */
+			if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
+				xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
+
+			/* EOI the source */
+			xive_vm_source_eoi(hw_num, xd);
+
+next:
+			idx = (idx + 1) & q->msk;
+			if (idx == 0)
+				toggle ^= 1;
+		}
+	}
+}
+
+static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	u8 old_cppr;
+
+	pr_devel("H_CPPR(cppr=%ld)\n", cppr);
+
+	xc->stat_vm_h_cppr++;
+
+	/* Map CPPR */
+	cppr = xive_prio_from_guest(cppr);
+
+	/* Remember old and update SW state */
+	old_cppr = xc->cppr;
+	xc->cppr = cppr;
+
+	/*
+	 * Order the above update of xc->cppr with the subsequent
+	 * read of xc->mfrr inside push_pending_to_hw()
+	 */
+	smp_mb();
+
+	if (cppr > old_cppr) {
+		/*
+		 * We are masking less, we need to look for pending things
+		 * to deliver and set VP pending bits accordingly to trigger
+		 * a new interrupt otherwise we might miss MFRR changes for
+		 * which we have optimized out sending an IPI signal.
+		 */
+		xive_vm_push_pending_to_hw(xc);
+	} else {
+		/*
+		 * We are masking more, we need to check the queue for any
+		 * interrupt that has been routed to another CPU, take
+		 * it out (replace it with the dummy) and retrigger it.
+		 *
+		 * This is necessary since those interrupts may otherwise
+		 * never be processed, at least not until this CPU restores
+		 * its CPPR.
+		 *
+		 * This is in theory racy vs. HW adding new interrupts to
+		 * the queue. In practice this works because the interesting
+		 * cases are when the guest has done a set_xive() to move the
+		 * interrupt away, which flushes the xive, followed by the
+		 * target CPU doing a H_CPPR. So any new interrupt coming into
+		 * the queue must still be routed to us and isn't a source
+		 * of concern.
+		 */
+		xive_vm_scan_for_rerouted_irqs(xive, xc);
+	}
+
+	/* Apply new CPPR */
+	xc->hw_cppr = cppr;
+	__raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+
+	return H_SUCCESS;
+}
+
+static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_irq_data *xd;
+	u8 new_cppr = xirr >> 24;
+	u32 irq = xirr & 0x00ffffff, hw_num;
+	u16 src;
+	int rc = 0;
+
+	pr_devel("H_EOI(xirr=%08lx)\n", xirr);
+
+	xc->stat_vm_h_eoi++;
+
+	xc->cppr = xive_prio_from_guest(new_cppr);
+
+	/*
+	 * IPIs are synthetized from MFRR and thus don't need
+	 * any special EOI handling. The underlying interrupt
+	 * used to signal MFRR changes is EOId when fetched from
+	 * the queue.
+	 */
+	if (irq == XICS_IPI || irq == 0) {
+		/*
+		 * This barrier orders the setting of xc->cppr vs.
+		 * subsequent test of xc->mfrr done inside
+		 * scan_interrupts and push_pending_to_hw
+		 */
+		smp_mb();
+		goto bail;
+	}
+
+	/* Find interrupt source */
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb) {
+		pr_devel(" source not found !\n");
+		rc = H_PARAMETER;
+		/* Same as above */
+		smp_mb();
+		goto bail;
+	}
+	state = &sb->irq_state[src];
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	state->in_eoi = true;
+
+	/*
+	 * This barrier orders both setting of in_eoi above vs,
+	 * subsequent test of guest_priority, and the setting
+	 * of xc->cppr vs. subsequent test of xc->mfrr done inside
+	 * scan_interrupts and push_pending_to_hw
+	 */
+	smp_mb();
+
+again:
+	if (state->guest_priority == MASKED) {
+		arch_spin_lock(&sb->lock);
+		if (state->guest_priority != MASKED) {
+			arch_spin_unlock(&sb->lock);
+			goto again;
+		}
+		pr_devel(" EOI on saved P...\n");
+
+		/* Clear old_p, that will cause unmask to perform an EOI */
+		state->old_p = false;
+
+		arch_spin_unlock(&sb->lock);
+	} else {
+		pr_devel(" EOI on source...\n");
+
+		/* Perform EOI on the source */
+		xive_vm_source_eoi(hw_num, xd);
+
+		/* If it's an emulated LSI, check level and resend */
+		if (state->lsi && state->asserted)
+			__raw_writeq(0, __x_trig_page(xd));
+
+	}
+
+	/*
+	 * This barrier orders the above guest_priority check
+	 * and spin_lock/unlock with clearing in_eoi below.
+	 *
+	 * It also has to be a full mb() as it must ensure
+	 * the MMIOs done in source_eoi() are completed before
+	 * state->in_eoi is visible.
+	 */
+	mb();
+	state->in_eoi = false;
+bail:
+
+	/* Re-evaluate pending IRQs and update HW */
+	xive_vm_scan_interrupts(xc, xc->pending, scan_eoi);
+	xive_vm_push_pending_to_hw(xc);
+	pr_devel(" after scan pending=%02x\n", xc->pending);
+
+	/* Apply new CPPR */
+	xc->hw_cppr = xc->cppr;
+	__raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
+
+	return rc;
+}
+
+static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			       unsigned long mfrr)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+	pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
+
+	xc->stat_vm_h_ipi++;
+
+	/* Find target */
+	vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+	if (!vcpu)
+		return H_PARAMETER;
+	xc = vcpu->arch.xive_vcpu;
+
+	/* Locklessly write over MFRR */
+	xc->mfrr = mfrr;
+
+	/*
+	 * The load of xc->cppr below and the subsequent MMIO store
+	 * to the IPI must happen after the above mfrr update is
+	 * globally visible so that:
+	 *
+	 * - Synchronize with another CPU doing an H_EOI or a H_CPPR
+	 *   updating xc->cppr then reading xc->mfrr.
+	 *
+	 * - The target of the IPI sees the xc->mfrr update
+	 */
+	mb();
+
+	/* Shoot the IPI if most favored than target cppr */
+	if (mfrr < xc->cppr)
+		__raw_writeq(0, __x_trig_page(&xc->vp_ipi_data));
+
+	return H_SUCCESS;
+}
+
+/*
+ * We leave a gap of a couple of interrupts in the queue to
+ * account for the IPI and additional safety guard.
+ */
+#define XIVE_Q_GAP	2
+
+static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+	/* Check enablement at VP level */
+	return xc->vp_cam & TM_QW1W2_HO;
+}
+
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = xc->xive;
+
+	if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE)
+		return kvmppc_xive_vcpu_has_save_restore(vcpu);
+
+	return true;
+}
+
+/*
+ * Push a vcpu's context to the XIVE on guest entry.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
+{
+	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+	u64 pq;
+
+	/*
+	 * Nothing to do if the platform doesn't have a XIVE
+	 * or this vCPU doesn't have its own XIVE context
+	 * (e.g. because it's not using an in-kernel interrupt controller).
+	 */
+	if (!tima || !vcpu->arch.xive_cam_word)
+		return;
+
+	eieio();
+	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
+		__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
+	vcpu->arch.xive_pushed = 1;
+	eieio();
+
+	/*
+	 * We clear the irq_pending flag. There is a small chance of a
+	 * race vs. the escalation interrupt happening on another
+	 * processor setting it again, but the only consequence is to
+	 * cause a spurious wakeup on the next H_CEDE, which is not an
+	 * issue.
+	 */
+	vcpu->arch.irq_pending = 0;
+
+	/*
+	 * In single escalation mode, if the escalation interrupt is
+	 * on, we mask it.
+	 */
+	if (vcpu->arch.xive_esc_on) {
+		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+						  XIVE_ESB_SET_PQ_01));
+		mb();
+
+		/*
+		 * We have a possible subtle race here: The escalation
+		 * interrupt might have fired and be on its way to the
+		 * host queue while we mask it, and if we unmask it
+		 * early enough (re-cede right away), there is a
+		 * theoretical possibility that it fires again, thus
+		 * landing in the target queue more than once which is
+		 * a big no-no.
+		 *
+		 * Fortunately, solving this is rather easy. If the
+		 * above load setting PQ to 01 returns a previous
+		 * value where P is set, then we know the escalation
+		 * interrupt is somewhere on its way to the host. In
+		 * that case we simply don't clear the xive_esc_on
+		 * flag below. It will be eventually cleared by the
+		 * handler for the escalation interrupt.
+		 *
+		 * Then, when doing a cede, we check that flag again
+		 * before re-enabling the escalation interrupt, and if
+		 * set, we abort the cede.
+		 */
+		if (!(pq & XIVE_ESB_VAL_P))
+			/* Now P is 0, we can clear the flag */
+			vcpu->arch.xive_esc_on = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
+
+/*
+ * Pull a vcpu's context from the XIVE on guest exit.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
+{
+	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+
+	if (!vcpu->arch.xive_pushed)
+		return;
+
+	/*
+	 * Should not have been pushed if there is no tima
+	 */
+	if (WARN_ON(!tima))
+		return;
+
+	eieio();
+	/* First load to pull the context, we ignore the value */
+	__raw_readl(tima + TM_SPC_PULL_OS_CTX);
+	/* Second load to recover the context state (Words 0 and 1) */
+	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
+		vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
+
+	/* Fixup some of the state for the next load */
+	vcpu->arch.xive_saved_state.lsmfb = 0;
+	vcpu->arch.xive_saved_state.ack = 0xff;
+	vcpu->arch.xive_pushed = 0;
+	eieio();
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
+
+bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
+{
+	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
+	bool ret = true;
+
+	if (!esc_vaddr)
+		return ret;
+
+	/* we are using XIVE with single escalation */
+
+	if (vcpu->arch.xive_esc_on) {
+		/*
+		 * If we still have a pending escalation, abort the cede,
+		 * and we must set PQ to 10 rather than 00 so that we don't
+		 * potentially end up with two entries for the escalation
+		 * interrupt in the XIVE interrupt queue.  In that case
+		 * we also don't want to set xive_esc_on to 1 here in
+		 * case we race with xive_esc_irq().
+		 */
+		ret = false;
+		/*
+		 * The escalation interrupts are special as we don't EOI them.
+		 * There is no need to use the load-after-store ordering offset
+		 * to set PQ to 10 as we won't use StoreEOI.
+		 */
+		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
+	} else {
+		vcpu->arch.xive_esc_on = true;
+		mb();
+		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
+	}
+	mb();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
+
+/*
+ * This is a simple trigger for a generic XIVE IRQ. This must
+ * only be called for interrupts that support a trigger page
+ */
+static bool xive_irq_trigger(struct xive_irq_data *xd)
+{
+	/* This should be only for MSIs */
+	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
+		return false;
+
+	/* Those interrupts should always have a trigger page */
+	if (WARN_ON(!xd->trig_mmio))
+		return false;
+
+	out_be64(xd->trig_mmio, 0);
+
+	return true;
+}
+
+static irqreturn_t xive_esc_irq(int irq, void *data)
+{
+	struct kvm_vcpu *vcpu = data;
+
+	vcpu->arch.irq_pending = 1;
+	smp_mb();
+	if (vcpu->arch.ceded || vcpu->arch.nested)
+		kvmppc_fast_vcpu_kick(vcpu);
+
+	/* Since we have the no-EOI flag, the interrupt is effectively
+	 * disabled now. Clearing xive_esc_on means we won't bother
+	 * doing so on the next entry.
+	 *
+	 * This also allows the entry code to know that if a PQ combination
+	 * of 10 is observed while xive_esc_on is true, it means the queue
+	 * contains an unprocessed escalation interrupt. We don't make use of
+	 * that knowledge today but might (see comment in book3s_hv_rmhandler.S)
+	 */
+	vcpu->arch.xive_esc_on = false;
+
+	/* This orders xive_esc_on = false vs. subsequent stale_p = true */
+	smp_wmb();	/* goes with smp_mb() in cleanup_single_escalation */
+
+	return IRQ_HANDLED;
+}
+
+int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
+				  bool single_escalation)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_q *q = &xc->queues[prio];
+	char *name = NULL;
+	int rc;
+
+	/* Already there ? */
+	if (xc->esc_virq[prio])
+		return 0;
+
+	/* Hook up the escalation interrupt */
+	xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
+	if (!xc->esc_virq[prio]) {
+		pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		return -EIO;
+	}
+
+	if (single_escalation)
+		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
+				 vcpu->kvm->arch.lpid, xc->server_num);
+	else
+		name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
+				 vcpu->kvm->arch.lpid, xc->server_num, prio);
+	if (!name) {
+		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		rc = -ENOMEM;
+		goto error;
+	}
+
+	pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
+
+	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
+			 IRQF_NO_THREAD, name, vcpu);
+	if (rc) {
+		pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		goto error;
+	}
+	xc->esc_virq_names[prio] = name;
+
+	/* In single escalation mode, we grab the ESB MMIO of the
+	 * interrupt and mask it. Also populate the VCPU v/raddr
+	 * of the ESB page for use by asm entry/exit code. Finally
+	 * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the
+	 * core code from performing an EOI on the escalation
+	 * interrupt, thus leaving it effectively masked after
+	 * it fires once.
+	 */
+	if (single_escalation) {
+		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
+		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+		vcpu->arch.xive_esc_raddr = xd->eoi_page;
+		vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
+		xd->flags |= XIVE_IRQ_FLAG_NO_EOI;
+	}
+
+	return 0;
+error:
+	irq_dispose_mapping(xc->esc_virq[prio]);
+	xc->esc_virq[prio] = 0;
+	kfree(name);
+	return rc;
+}
+
+static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = xc->xive;
+	struct xive_q *q =  &xc->queues[prio];
+	void *qpage;
+	int rc;
+
+	if (WARN_ON(q->qpage))
+		return 0;
+
+	/* Allocate the queue and retrieve infos on current node for now */
+	qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order);
+	if (!qpage) {
+		pr_err("Failed to allocate queue %d for VCPU %d\n",
+		       prio, xc->server_num);
+		return -ENOMEM;
+	}
+	memset(qpage, 0, 1 << xive->q_order);
+
+	/*
+	 * Reconfigure the queue. This will set q->qpage only once the
+	 * queue is fully configured. This is a requirement for prio 0
+	 * as we will stop doing EOIs for every IPI as soon as we observe
+	 * qpage being non-NULL, and instead will only EOI when we receive
+	 * corresponding queue 0 entries
+	 */
+	rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
+					 xive->q_order, true);
+	if (rc)
+		pr_err("Failed to configure queue %d for VCPU %d\n",
+		       prio, xc->server_num);
+	return rc;
+}
+
+/* Called with xive->lock held */
+static int xive_check_provisioning(struct kvm *kvm, u8 prio)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+	int rc;
+
+	lockdep_assert_held(&xive->lock);
+
+	/* Already provisioned ? */
+	if (xive->qmap & (1 << prio))
+		return 0;
+
+	pr_devel("Provisioning prio... %d\n", prio);
+
+	/* Provision each VCPU and enable escalations if needed */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!vcpu->arch.xive_vcpu)
+			continue;
+		rc = xive_provision_queue(vcpu, prio);
+		if (rc == 0 && !kvmppc_xive_has_single_escalation(xive))
+			kvmppc_xive_attach_escalation(vcpu, prio,
+						      kvmppc_xive_has_single_escalation(xive));
+		if (rc)
+			return rc;
+	}
+
+	/* Order previous stores and mark it as provisioned */
+	mb();
+	xive->qmap |= (1 << prio);
+	return 0;
+}
+
+static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvmppc_xive_vcpu *xc;
+	struct xive_q *q;
+
+	/* Locate target server */
+	vcpu = kvmppc_xive_find_server(kvm, server);
+	if (!vcpu) {
+		pr_warn("%s: Can't find server %d\n", __func__, server);
+		return;
+	}
+	xc = vcpu->arch.xive_vcpu;
+	if (WARN_ON(!xc))
+		return;
+
+	q = &xc->queues[prio];
+	atomic_inc(&q->pending_count);
+}
+
+static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_q *q;
+	u32 max;
+
+	if (WARN_ON(!xc))
+		return -ENXIO;
+	if (!xc->valid)
+		return -ENXIO;
+
+	q = &xc->queues[prio];
+	if (WARN_ON(!q->qpage))
+		return -ENXIO;
+
+	/* Calculate max number of interrupts in that queue. */
+	max = (q->msk + 1) - XIVE_Q_GAP;
+	return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
+}
+
+int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+	int rc;
+
+	/* Locate target server */
+	vcpu = kvmppc_xive_find_server(kvm, *server);
+	if (!vcpu) {
+		pr_devel("Can't find server %d\n", *server);
+		return -EINVAL;
+	}
+
+	pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio);
+
+	/* Try pick it */
+	rc = xive_try_pick_queue(vcpu, prio);
+	if (rc == 0)
+		return rc;
+
+	pr_devel(" .. failed, looking up candidate...\n");
+
+	/* Failed, pick another VCPU */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!vcpu->arch.xive_vcpu)
+			continue;
+		rc = xive_try_pick_queue(vcpu, prio);
+		if (rc == 0) {
+			*server = vcpu->arch.xive_vcpu->server_num;
+			pr_devel("  found on 0x%x/%d\n", *server, prio);
+			return rc;
+		}
+	}
+	pr_devel("  no available target !\n");
+
+	/* No available target ! */
+	return -EBUSY;
+}
+
+static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
+			     struct kvmppc_xive_src_block *sb,
+			     struct kvmppc_xive_irq_state *state)
+{
+	struct xive_irq_data *xd;
+	u32 hw_num;
+	u8 old_prio;
+	u64 val;
+
+	/*
+	 * Take the lock, set masked, try again if racing
+	 * with H_EOI
+	 */
+	for (;;) {
+		arch_spin_lock(&sb->lock);
+		old_prio = state->guest_priority;
+		state->guest_priority = MASKED;
+		mb();
+		if (!state->in_eoi)
+			break;
+		state->guest_priority = old_prio;
+		arch_spin_unlock(&sb->lock);
+	}
+
+	/* No change ? Bail */
+	if (old_prio == MASKED)
+		return old_prio;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	/* Set PQ to 10, return old P and old Q and remember them */
+	val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
+	state->old_p = !!(val & 2);
+	state->old_q = !!(val & 1);
+
+	/*
+	 * Synchronize hardware to sensure the queues are updated when
+	 * masking
+	 */
+	xive_native_sync_source(hw_num);
+
+	return old_prio;
+}
+
+static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
+				 struct kvmppc_xive_irq_state *state)
+{
+	/*
+	 * Take the lock try again if racing with H_EOI
+	 */
+	for (;;) {
+		arch_spin_lock(&sb->lock);
+		if (!state->in_eoi)
+			break;
+		arch_spin_unlock(&sb->lock);
+	}
+}
+
+static void xive_finish_unmask(struct kvmppc_xive *xive,
+			       struct kvmppc_xive_src_block *sb,
+			       struct kvmppc_xive_irq_state *state,
+			       u8 prio)
+{
+	struct xive_irq_data *xd;
+	u32 hw_num;
+
+	/* If we aren't changing a thing, move on */
+	if (state->guest_priority != MASKED)
+		goto bail;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	/* Old Q set, set PQ to 11 */
+	if (state->old_q)
+		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
+
+	/*
+	 * If not old P, then perform an "effective" EOI,
+	 * on the source. This will handle the cases where
+	 * FW EOI is needed.
+	 */
+	if (!state->old_p)
+		xive_vm_source_eoi(hw_num, xd);
+
+	/* Synchronize ordering and mark unmasked */
+	mb();
+bail:
+	state->guest_priority = prio;
+}
+
+/*
+ * Target an interrupt to a given server/prio, this will fallback
+ * to another server if necessary and perform the HW targetting
+ * updates as needed
+ *
+ * NOTE: Must be called with the state lock held
+ */
+static int xive_target_interrupt(struct kvm *kvm,
+				 struct kvmppc_xive_irq_state *state,
+				 u32 server, u8 prio)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	u32 hw_num;
+	int rc;
+
+	/*
+	 * This will return a tentative server and actual
+	 * priority. The count for that new target will have
+	 * already been incremented.
+	 */
+	rc = kvmppc_xive_select_target(kvm, &server, prio);
+
+	/*
+	 * We failed to find a target ? Not much we can do
+	 * at least until we support the GIQ.
+	 */
+	if (rc)
+		return rc;
+
+	/*
+	 * Increment the old queue pending count if there
+	 * was one so that the old queue count gets adjusted later
+	 * when observed to be empty.
+	 */
+	if (state->act_priority != MASKED)
+		xive_inc_q_pending(kvm,
+				   state->act_server,
+				   state->act_priority);
+	/*
+	 * Update state and HW
+	 */
+	state->act_priority = prio;
+	state->act_server = server;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, NULL);
+
+	return xive_native_configure_irq(hw_num,
+					 kvmppc_xive_vp(xive, server),
+					 prio, state->number);
+}
+
+/*
+ * Targetting rules: In order to avoid losing track of
+ * pending interrupts across mask and unmask, which would
+ * allow queue overflows, we implement the following rules:
+ *
+ *  - Unless it was never enabled (or we run out of capacity)
+ *    an interrupt is always targetted at a valid server/queue
+ *    pair even when "masked" by the guest. This pair tends to
+ *    be the last one used but it can be changed under some
+ *    circumstances. That allows us to separate targetting
+ *    from masking, we only handle accounting during (re)targetting,
+ *    this also allows us to let an interrupt drain into its target
+ *    queue after masking, avoiding complex schemes to remove
+ *    interrupts out of remote processor queues.
+ *
+ *  - When masking, we set PQ to 10 and save the previous value
+ *    of P and Q.
+ *
+ *  - When unmasking, if saved Q was set, we set PQ to 11
+ *    otherwise we leave PQ to the HW state which will be either
+ *    10 if nothing happened or 11 if the interrupt fired while
+ *    masked. Effectively we are OR'ing the previous Q into the
+ *    HW Q.
+ *
+ *    Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
+ *    which will unmask the interrupt and shoot a new one if Q was
+ *    set.
+ *
+ *    Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
+ *    effectively meaning an H_EOI from the guest is still expected
+ *    for that interrupt).
+ *
+ *  - If H_EOI occurs while masked, we clear the saved P.
+ *
+ *  - When changing target, we account on the new target and
+ *    increment a separate "pending" counter on the old one.
+ *    This pending counter will be used to decrement the old
+ *    target's count when its queue has been observed empty.
+ */
+
+int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
+			 u32 priority)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u8 new_act_prio;
+	int rc = 0;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n",
+		 irq, server, priority);
+
+	/* First, check provisioning of queues */
+	if (priority != MASKED) {
+		mutex_lock(&xive->lock);
+		rc = xive_check_provisioning(xive->kvm,
+			      xive_prio_from_guest(priority));
+		mutex_unlock(&xive->lock);
+	}
+	if (rc) {
+		pr_devel("  provisioning failure %d !\n", rc);
+		return rc;
+	}
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * We first handle masking/unmasking since the locking
+	 * might need to be retried due to EOIs, we'll handle
+	 * targetting changes later. These functions will return
+	 * with the SB lock held.
+	 *
+	 * xive_lock_and_mask() will also set state->guest_priority
+	 * but won't otherwise change other fields of the state.
+	 *
+	 * xive_lock_for_unmask will not actually unmask, this will
+	 * be done later by xive_finish_unmask() once the targetting
+	 * has been done, so we don't try to unmask an interrupt
+	 * that hasn't yet been targetted.
+	 */
+	if (priority == MASKED)
+		xive_lock_and_mask(xive, sb, state);
+	else
+		xive_lock_for_unmask(sb, state);
+
+
+	/*
+	 * Then we handle targetting.
+	 *
+	 * First calculate a new "actual priority"
+	 */
+	new_act_prio = state->act_priority;
+	if (priority != MASKED)
+		new_act_prio = xive_prio_from_guest(priority);
+
+	pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n",
+		 new_act_prio, state->act_server, state->act_priority);
+
+	/*
+	 * Then check if we actually need to change anything,
+	 *
+	 * The condition for re-targetting the interrupt is that
+	 * we have a valid new priority (new_act_prio is not 0xff)
+	 * and either the server or the priority changed.
+	 *
+	 * Note: If act_priority was ff and the new priority is
+	 *       also ff, we don't do anything and leave the interrupt
+	 *       untargetted. An attempt of doing an int_on on an
+	 *       untargetted interrupt will fail. If that is a problem
+	 *       we could initialize interrupts with valid default
+	 */
+
+	if (new_act_prio != MASKED &&
+	    (state->act_server != server ||
+	     state->act_priority != new_act_prio))
+		rc = xive_target_interrupt(kvm, state, server, new_act_prio);
+
+	/*
+	 * Perform the final unmasking of the interrupt source
+	 * if necessary
+	 */
+	if (priority != MASKED)
+		xive_finish_unmask(xive, sb, state, priority);
+
+	/*
+	 * Finally Update saved_priority to match. Only int_on/off
+	 * set this field to a different value.
+	 */
+	state->saved_priority = priority;
+
+	arch_spin_unlock(&sb->lock);
+	return rc;
+}
+
+int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+			 u32 *priority)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+	arch_spin_lock(&sb->lock);
+	*server = state->act_server;
+	*priority = state->guest_priority;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	pr_devel("int_on(irq=0x%x)\n", irq);
+
+	/*
+	 * Check if interrupt was not targetted
+	 */
+	if (state->act_priority == MASKED) {
+		pr_devel("int_on on untargetted interrupt\n");
+		return -EINVAL;
+	}
+
+	/* If saved_priority is 0xff, do nothing */
+	if (state->saved_priority == MASKED)
+		return 0;
+
+	/*
+	 * Lock and unmask it.
+	 */
+	xive_lock_for_unmask(sb, state);
+	xive_finish_unmask(xive, sb, state, state->saved_priority);
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	pr_devel("int_off(irq=0x%x)\n", irq);
+
+	/*
+	 * Lock and mask
+	 */
+	state->saved_priority = xive_lock_and_mask(xive, sb, state);
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return false;
+	state = &sb->irq_state[idx];
+	if (!state->valid)
+		return false;
+
+	/*
+	 * Trigger the IPI. This assumes we never restore a pass-through
+	 * interrupt which should be safe enough
+	 */
+	xive_irq_trigger(&state->ipi_data);
+
+	return true;
+}
+
+u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+	if (!xc)
+		return 0;
+
+	/* Return the per-cpu state for state saving/migration */
+	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
+	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
+}
+
+int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	u8 cppr, mfrr;
+	u32 xisr;
+
+	if (!xc || !xive)
+		return -ENOENT;
+
+	/* Grab individual state fields. We don't use pending_pri */
+	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+		KVM_REG_PPC_ICP_XISR_MASK;
+	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+
+	pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
+		 xc->server_num, cppr, mfrr, xisr);
+
+	/*
+	 * We can't update the state of a "pushed" VCPU, but that
+	 * shouldn't happen because the vcpu->mutex makes running a
+	 * vcpu mutually exclusive with doing one_reg get/set on it.
+	 */
+	if (WARN_ON(vcpu->arch.xive_pushed))
+		return -EIO;
+
+	/* Update VCPU HW saved state */
+	vcpu->arch.xive_saved_state.cppr = cppr;
+	xc->hw_cppr = xc->cppr = cppr;
+
+	/*
+	 * Update MFRR state. If it's not 0xff, we mark the VCPU as
+	 * having a pending MFRR change, which will re-evaluate the
+	 * target. The VCPU will thus potentially get a spurious
+	 * interrupt but that's not a big deal.
+	 */
+	xc->mfrr = mfrr;
+	if (mfrr < cppr)
+		xive_irq_trigger(&xc->vp_ipi_data);
+
+	/*
+	 * Now saved XIRR is "interesting". It means there's something in
+	 * the legacy "1 element" queue... for an IPI we simply ignore it,
+	 * as the MFRR restore will handle that. For anything else we need
+	 * to force a resend of the source.
+	 * However the source may not have been setup yet. If that's the
+	 * case, we keep that info and increment a counter in the xive to
+	 * tell subsequent xive_set_source() to go look.
+	 */
+	if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
+		xc->delayed_irq = xisr;
+		xive->delayed_irqs++;
+		pr_devel("  xisr restore delayed\n");
+	}
+
+	return 0;
+}
+
+int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+			   unsigned long host_irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct irq_data *host_data =
+		irq_domain_get_irq_data(irq_get_default_host(), host_irq);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
+	u16 idx;
+	u8 prio;
+	int rc;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
+		 __func__, guest_irq, host_irq, hw_irq);
+
+	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * Mark the passed-through interrupt as going to a VCPU,
+	 * this will prevent further EOIs and similar operations
+	 * from the XIVE code. It will also mask the interrupt
+	 * to either PQ=10 or 11 state, the latter if the interrupt
+	 * is pending. This will allow us to unmask or retrigger it
+	 * after routing it to the guest with a simple EOI.
+	 *
+	 * The "state" argument is a "token", all it needs is to be
+	 * non-NULL to switch to passed-through or NULL for the
+	 * other way around. We may not yet have an actual VCPU
+	 * target here and we don't really care.
+	 */
+	rc = irq_set_vcpu_affinity(host_irq, state);
+	if (rc) {
+		pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq);
+		return rc;
+	}
+
+	/*
+	 * Mask and read state of IPI. We need to know if its P bit
+	 * is set as that means it's potentially already using a
+	 * queue entry in the target
+	 */
+	prio = xive_lock_and_mask(xive, sb, state);
+	pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
+		 state->old_p, state->old_q);
+
+	/* Turn the IPI hard off */
+	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+
+	/*
+	 * Reset ESB guest mapping. Needed when ESB pages are exposed
+	 * to the guest in XIVE native mode
+	 */
+	if (xive->ops && xive->ops->reset_mapped)
+		xive->ops->reset_mapped(kvm, guest_irq);
+
+	/* Grab info about irq */
+	state->pt_number = hw_irq;
+	state->pt_data = irq_data_get_irq_handler_data(host_data);
+
+	/*
+	 * Configure the IRQ to match the existing configuration of
+	 * the IPI if it was already targetted. Otherwise this will
+	 * mask the interrupt in a lossy way (act_priority is 0xff)
+	 * which is fine for a never started interrupt.
+	 */
+	xive_native_configure_irq(hw_irq,
+				  kvmppc_xive_vp(xive, state->act_server),
+				  state->act_priority, state->number);
+
+	/*
+	 * We do an EOI to enable the interrupt (and retrigger if needed)
+	 * if the guest has the interrupt unmasked and the P bit was *not*
+	 * set in the IPI. If it was set, we know a slot may still be in
+	 * use in the target queue thus we have to wait for a guest
+	 * originated EOI
+	 */
+	if (prio != MASKED && !state->old_p)
+		xive_vm_source_eoi(hw_irq, state->pt_data);
+
+	/* Clear old_p/old_q as they are no longer relevant */
+	state->old_p = state->old_q = false;
+
+	/* Restore guest prio (unlocks EOI) */
+	mb();
+	state->guest_priority = prio;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
+
+int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+			   unsigned long host_irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+	u8 prio;
+	int rc;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq);
+
+	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * Mask and read state of IRQ. We need to know if its P bit
+	 * is set as that means it's potentially already using a
+	 * queue entry in the target
+	 */
+	prio = xive_lock_and_mask(xive, sb, state);
+	pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
+		 state->old_p, state->old_q);
+
+	/*
+	 * If old_p is set, the interrupt is pending, we switch it to
+	 * PQ=11. This will force a resend in the host so the interrupt
+	 * isn't lost to whatever host driver may pick it up
+	 */
+	if (state->old_p)
+		xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
+
+	/* Release the passed-through interrupt to the host */
+	rc = irq_set_vcpu_affinity(host_irq, NULL);
+	if (rc) {
+		pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq);
+		return rc;
+	}
+
+	/* Forget about the IRQ */
+	state->pt_number = 0;
+	state->pt_data = NULL;
+
+	/*
+	 * Reset ESB guest mapping. Needed when ESB pages are exposed
+	 * to the guest in XIVE native mode
+	 */
+	if (xive->ops && xive->ops->reset_mapped) {
+		xive->ops->reset_mapped(kvm, guest_irq);
+	}
+
+	/* Reconfigure the IPI */
+	xive_native_configure_irq(state->ipi_number,
+				  kvmppc_xive_vp(xive, state->act_server),
+				  state->act_priority, state->number);
+
+	/*
+	 * If old_p is set (we have a queue entry potentially
+	 * occupied) or the interrupt is masked, we set the IPI
+	 * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
+	 */
+	if (prio == MASKED || state->old_p)
+		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
+	else
+		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
+
+	/* Restore guest prio (unlocks EOI) */
+	mb();
+	state->guest_priority = prio;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
+
+void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	int i, j;
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (!sb)
+			continue;
+		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
+			struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
+
+			if (!state->valid)
+				continue;
+			if (state->act_priority == MASKED)
+				continue;
+			if (state->act_server != xc->server_num)
+				continue;
+
+			/* Clean it up */
+			arch_spin_lock(&sb->lock);
+			state->act_priority = MASKED;
+			xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+			xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+			if (state->pt_number) {
+				xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
+				xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
+			}
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+
+	/* Disable vcpu's escalation interrupt */
+	if (vcpu->arch.xive_esc_on) {
+		__raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+					     XIVE_ESB_SET_PQ_01));
+		vcpu->arch.xive_esc_on = false;
+	}
+
+	/*
+	 * Clear pointers to escalation interrupt ESB.
+	 * This is safe because the vcpu->mutex is held, preventing
+	 * any other CPU from concurrently executing a KVM_RUN ioctl.
+	 */
+	vcpu->arch.xive_esc_vaddr = 0;
+	vcpu->arch.xive_esc_raddr = 0;
+}
+
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with.  However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it).  Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int irq)
+{
+	struct irq_data *d = irq_get_irq_data(irq);
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	/*
+	 * This slightly odd sequence gives the right result
+	 * (i.e. stale_p set if xive_esc_on is false) even if
+	 * we race with xive_esc_irq() and xive_irq_eoi().
+	 */
+	xd->stale_p = false;
+	smp_mb();		/* paired with smb_wmb in xive_esc_irq */
+	if (!vcpu->arch.xive_esc_on)
+		xd->stale_p = true;
+}
+
+void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	int i;
+
+	if (!kvmppc_xics_enabled(vcpu))
+		return;
+
+	if (!xc)
+		return;
+
+	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
+
+	/* Ensure no interrupt is still routed to that VP */
+	xc->valid = false;
+	kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+	/* Mask the VP IPI */
+	xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
+
+	/* Free escalations */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		if (xc->esc_virq[i]) {
+			if (kvmppc_xive_has_single_escalation(xc->xive))
+				xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]);
+			free_irq(xc->esc_virq[i], vcpu);
+			irq_dispose_mapping(xc->esc_virq[i]);
+			kfree(xc->esc_virq_names[i]);
+		}
+	}
+
+	/* Disable the VP */
+	xive_native_disable_vp(xc->vp_id);
+
+	/* Clear the cam word so guest entry won't try to push context */
+	vcpu->arch.xive_cam_word = 0;
+
+	/* Free the queues */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		struct xive_q *q = &xc->queues[i];
+
+		xive_native_disable_queue(xc->vp_id, q, i);
+		if (q->qpage) {
+			free_pages((unsigned long)q->qpage,
+				   xive->q_page_order);
+			q->qpage = NULL;
+		}
+	}
+
+	/* Free the IPI */
+	if (xc->vp_ipi) {
+		xive_cleanup_irq_data(&xc->vp_ipi_data);
+		xive_native_free_irq(xc->vp_ipi);
+	}
+	/* Free the VP */
+	kfree(xc);
+
+	/* Cleanup the vcpu */
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+	vcpu->arch.xive_vcpu = NULL;
+}
+
+static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
+{
+	/* We have a block of xive->nr_servers VPs. We just need to check
+	 * packed vCPU ids are below that.
+	 */
+	return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;
+}
+
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
+{
+	u32 vp_id;
+
+	if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
+		pr_devel("Out of bounds !\n");
+		return -EINVAL;
+	}
+
+	if (xive->vp_base == XIVE_INVALID_VP) {
+		xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
+		pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);
+
+		if (xive->vp_base == XIVE_INVALID_VP)
+			return -ENOSPC;
+	}
+
+	vp_id = kvmppc_xive_vp(xive, cpu);
+	if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+		pr_devel("Duplicate !\n");
+		return -EEXIST;
+	}
+
+	*vp = vp_id;
+
+	return 0;
+}
+
+int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
+			     struct kvm_vcpu *vcpu, u32 cpu)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvmppc_xive_vcpu *xc;
+	int i, r = -EBUSY;
+	u32 vp_id;
+
+	pr_devel("connect_vcpu(cpu=%d)\n", cpu);
+
+	if (dev->ops != &kvm_xive_ops) {
+		pr_devel("Wrong ops !\n");
+		return -EPERM;
+	}
+	if (xive->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
+		return -EBUSY;
+
+	/* We need to synchronize with queue provisioning */
+	mutex_lock(&xive->lock);
+
+	r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
+	if (r)
+		goto bail;
+
+	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+	if (!xc) {
+		r = -ENOMEM;
+		goto bail;
+	}
+
+	vcpu->arch.xive_vcpu = xc;
+	xc->xive = xive;
+	xc->vcpu = vcpu;
+	xc->server_num = cpu;
+	xc->vp_id = vp_id;
+	xc->mfrr = 0xff;
+	xc->valid = true;
+
+	r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
+	if (r)
+		goto bail;
+
+	if (!kvmppc_xive_check_save_restore(vcpu)) {
+		pr_err("inconsistent save-restore setup for VCPU %d\n", cpu);
+		r = -EIO;
+		goto bail;
+	}
+
+	/* Configure VCPU fields for use by assembly push/pull */
+	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
+	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
+
+	/* Allocate IPI */
+	xc->vp_ipi = xive_native_alloc_irq();
+	if (!xc->vp_ipi) {
+		pr_err("Failed to allocate xive irq for VCPU IPI\n");
+		r = -EIO;
+		goto bail;
+	}
+	pr_devel(" IPI=0x%x\n", xc->vp_ipi);
+
+	r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
+	if (r)
+		goto bail;
+
+	/*
+	 * Enable the VP first as the single escalation mode will
+	 * affect escalation interrupts numbering
+	 */
+	r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
+	if (r) {
+		pr_err("Failed to enable VP in OPAL, err %d\n", r);
+		goto bail;
+	}
+
+	/*
+	 * Initialize queues. Initially we set them all for no queueing
+	 * and we enable escalation for queue 0 only which we'll use for
+	 * our mfrr change notifications. If the VCPU is hot-plugged, we
+	 * do handle provisioning however based on the existing "map"
+	 * of enabled queues.
+	 */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		struct xive_q *q = &xc->queues[i];
+
+		/* Single escalation, no queue 7 */
+		if (i == 7 && kvmppc_xive_has_single_escalation(xive))
+			break;
+
+		/* Is queue already enabled ? Provision it */
+		if (xive->qmap & (1 << i)) {
+			r = xive_provision_queue(vcpu, i);
+			if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
+				kvmppc_xive_attach_escalation(
+					vcpu, i, kvmppc_xive_has_single_escalation(xive));
+			if (r)
+				goto bail;
+		} else {
+			r = xive_native_configure_queue(xc->vp_id,
+							q, i, NULL, 0, true);
+			if (r) {
+				pr_err("Failed to configure queue %d for VCPU %d\n",
+				       i, cpu);
+				goto bail;
+			}
+		}
+	}
+
+	/* If not done above, attach priority 0 escalation */
+	r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive));
+	if (r)
+		goto bail;
+
+	/* Route the IPI */
+	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
+	if (!r)
+		xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
+
+bail:
+	mutex_unlock(&xive->lock);
+	if (r) {
+		kvmppc_xive_cleanup_vcpu(vcpu);
+		return r;
+	}
+
+	vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+	return 0;
+}
+
+/*
+ * Scanning of queues before/after migration save
+ */
+static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return;
+
+	state = &sb->irq_state[idx];
+
+	/* Some sanity checking */
+	if (!state->valid) {
+		pr_err("invalid irq 0x%x in cpu queue!\n", irq);
+		return;
+	}
+
+	/*
+	 * If the interrupt is in a queue it should have P set.
+	 * We warn so that gets reported. A backtrace isn't useful
+	 * so no need to use a WARN_ON.
+	 */
+	if (!state->saved_p)
+		pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);
+
+	/* Set flag */
+	state->in_queue = true;
+}
+
+static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
+				   struct kvmppc_xive_src_block *sb,
+				   u32 irq)
+{
+	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
+
+	if (!state->valid)
+		return;
+
+	/* Mask and save state, this will also sync HW queues */
+	state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
+
+	/* Transfer P and Q */
+	state->saved_p = state->old_p;
+	state->saved_q = state->old_q;
+
+	/* Unlock */
+	arch_spin_unlock(&sb->lock);
+}
+
+static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
+				     struct kvmppc_xive_src_block *sb,
+				     u32 irq)
+{
+	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
+
+	if (!state->valid)
+		return;
+
+	/*
+	 * Lock / exclude EOI (not technically necessary if the
+	 * guest isn't running concurrently. If this becomes a
+	 * performance issue we can probably remove the lock.
+	 */
+	xive_lock_for_unmask(sb, state);
+
+	/* Restore mask/prio if it wasn't masked */
+	if (state->saved_scan_prio != MASKED)
+		xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
+
+	/* Unlock */
+	arch_spin_unlock(&sb->lock);
+}
+
+static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
+{
+	u32 idx = q->idx;
+	u32 toggle = q->toggle;
+	u32 irq;
+
+	do {
+		irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
+		if (irq > XICS_IPI)
+			xive_pre_save_set_queued(xive, irq);
+	} while(irq);
+}
+
+static void xive_pre_save_scan(struct kvmppc_xive *xive)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long i;
+	int j;
+
+	/*
+	 * See comment in xive_get_source() about how this
+	 * work. Collect a stable state for all interrupts
+	 */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			xive_pre_save_mask_irq(xive, sb, j);
+	}
+
+	/* Then scan the queues and update the "in_queue" flag */
+	kvm_for_each_vcpu(i, vcpu, xive->kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+		if (!xc)
+			continue;
+		for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
+			if (xc->queues[j].qpage)
+				xive_pre_save_queue(xive, &xc->queues[j]);
+		}
+	}
+
+	/* Finally restore interrupt states */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			xive_pre_save_unmask_irq(xive, sb, j);
+	}
+}
+
+static void xive_post_save_scan(struct kvmppc_xive *xive)
+{
+	u32 i, j;
+
+	/* Clear all the in_queue flags */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			sb->irq_state[j].in_queue = false;
+	}
+
+	/* Next get_source() will do a new scan */
+	xive->saved_src_count = 0;
+}
+
+/*
+ * This returns the source configuration and state to user space.
+ */
+static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u64 val, prio;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -ENOENT;
+
+	state = &sb->irq_state[idx];
+
+	if (!state->valid)
+		return -ENOENT;
+
+	pr_devel("get_source(%ld)...\n", irq);
+
+	/*
+	 * So to properly save the state into something that looks like a
+	 * XICS migration stream we cannot treat interrupts individually.
+	 *
+	 * We need, instead, mask them all (& save their previous PQ state)
+	 * to get a stable state in the HW, then sync them to ensure that
+	 * any interrupt that had already fired hits its queue, and finally
+	 * scan all the queues to collect which interrupts are still present
+	 * in the queues, so we can set the "pending" flag on them and
+	 * they can be resent on restore.
+	 *
+	 * So we do it all when the "first" interrupt gets saved, all the
+	 * state is collected at that point, the rest of xive_get_source()
+	 * will merely collect and convert that state to the expected
+	 * userspace bit mask.
+	 */
+	if (xive->saved_src_count == 0)
+		xive_pre_save_scan(xive);
+	xive->saved_src_count++;
+
+	/* Convert saved state into something compatible with xics */
+	val = state->act_server;
+	prio = state->saved_scan_prio;
+
+	if (prio == MASKED) {
+		val |= KVM_XICS_MASKED;
+		prio = state->saved_priority;
+	}
+	val |= prio << KVM_XICS_PRIORITY_SHIFT;
+	if (state->lsi) {
+		val |= KVM_XICS_LEVEL_SENSITIVE;
+		if (state->saved_p)
+			val |= KVM_XICS_PENDING;
+	} else {
+		if (state->saved_p)
+			val |= KVM_XICS_PRESENTED;
+
+		if (state->saved_q)
+			val |= KVM_XICS_QUEUED;
+
+		/*
+		 * We mark it pending (which will attempt a re-delivery)
+		 * if we are in a queue *or* we were masked and had
+		 * Q set which is equivalent to the XICS "masked pending"
+		 * state
+		 */
+		if (state->in_queue || (prio == MASKED && state->saved_q))
+			val |= KVM_XICS_PENDING;
+	}
+
+	/*
+	 * If that was the last interrupt saved, reset the
+	 * in_queue flags
+	 */
+	if (xive->saved_src_count == xive->src_count)
+		xive_post_save_scan(xive);
+
+	/* Copy the result to userspace */
+	if (put_user(val, ubufp))
+		return -EFAULT;
+
+	return 0;
+}
+
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+	struct kvmppc_xive *xive, int irq)
+{
+	struct kvmppc_xive_src_block *sb;
+	int i, bid;
+
+	bid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&xive->lock);
+
+	/* block already exists - somebody else got here first */
+	if (xive->src_blocks[bid])
+		goto out;
+
+	/* Create the ICS */
+	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
+	if (!sb)
+		goto out;
+
+	sb->id = bid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
+		sb->irq_state[i].eisn = 0;
+		sb->irq_state[i].guest_priority = MASKED;
+		sb->irq_state[i].saved_priority = MASKED;
+		sb->irq_state[i].act_priority = MASKED;
+	}
+	smp_wmb();
+	xive->src_blocks[bid] = sb;
+
+	if (bid > xive->max_sbid)
+		xive->max_sbid = bid;
+
+out:
+	mutex_unlock(&xive->lock);
+	return xive->src_blocks[bid];
+}
+
+static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+		if (!xc)
+			continue;
+
+		if (xc->delayed_irq == irq) {
+			xc->delayed_irq = 0;
+			xive->delayed_irqs--;
+			return true;
+		}
+	}
+	return false;
+}
+
+static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 act_prio, guest_prio;
+	u32 server;
+	int rc = 0;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	pr_devel("set_source(irq=0x%lx)\n", irq);
+
+	/* Find the source */
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb) {
+		pr_devel("No source, creating source block...\n");
+		sb = kvmppc_xive_create_src_block(xive, irq);
+		if (!sb) {
+			pr_devel("Failed to create block...\n");
+			return -ENOMEM;
+		}
+	}
+	state = &sb->irq_state[idx];
+
+	/* Read user passed data */
+	if (get_user(val, ubufp)) {
+		pr_devel("fault getting user info !\n");
+		return -EFAULT;
+	}
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
+
+	pr_devel("  val=0x016%llx (server=0x%x, guest_prio=%d)\n",
+		 val, server, guest_prio);
+
+	/*
+	 * If the source doesn't already have an IPI, allocate
+	 * one and get the corresponding data
+	 */
+	if (!state->ipi_number) {
+		state->ipi_number = xive_native_alloc_irq();
+		if (state->ipi_number == 0) {
+			pr_devel("Failed to allocate IPI !\n");
+			return -ENOMEM;
+		}
+		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
+		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
+	}
+
+	/*
+	 * We use lock_and_mask() to set us in the right masked
+	 * state. We will override that state from the saved state
+	 * further down, but this will handle the cases of interrupts
+	 * that need FW masking. We set the initial guest_priority to
+	 * 0 before calling it to ensure it actually performs the masking.
+	 */
+	state->guest_priority = 0;
+	xive_lock_and_mask(xive, sb, state);
+
+	/*
+	 * Now, we select a target if we have one. If we don't we
+	 * leave the interrupt untargetted. It means that an interrupt
+	 * can become "untargetted" across migration if it was masked
+	 * by set_xive() but there is little we can do about it.
+	 */
+
+	/* First convert prio and mark interrupt as untargetted */
+	act_prio = xive_prio_from_guest(guest_prio);
+	state->act_priority = MASKED;
+
+	/*
+	 * We need to drop the lock due to the mutex below. Hopefully
+	 * nothing is touching that interrupt yet since it hasn't been
+	 * advertized to a running guest yet
+	 */
+	arch_spin_unlock(&sb->lock);
+
+	/* If we have a priority target the interrupt */
+	if (act_prio != MASKED) {
+		/* First, check provisioning of queues */
+		mutex_lock(&xive->lock);
+		rc = xive_check_provisioning(xive->kvm, act_prio);
+		mutex_unlock(&xive->lock);
+
+		/* Target interrupt */
+		if (rc == 0)
+			rc = xive_target_interrupt(xive->kvm, state,
+						   server, act_prio);
+		/*
+		 * If provisioning or targetting failed, leave it
+		 * alone and masked. It will remain disabled until
+		 * the guest re-targets it.
+		 */
+	}
+
+	/*
+	 * Find out if this was a delayed irq stashed in an ICP,
+	 * in which case, treat it as pending
+	 */
+	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
+		val |= KVM_XICS_PENDING;
+		pr_devel("  Found delayed ! forcing PENDING !\n");
+	}
+
+	/* Cleanup the SW state */
+	state->old_p = false;
+	state->old_q = false;
+	state->lsi = false;
+	state->asserted = false;
+
+	/* Restore LSI state */
+	if (val & KVM_XICS_LEVEL_SENSITIVE) {
+		state->lsi = true;
+		if (val & KVM_XICS_PENDING)
+			state->asserted = true;
+		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
+	}
+
+	/*
+	 * Restore P and Q. If the interrupt was pending, we
+	 * force Q and !P, which will trigger a resend.
+	 *
+	 * That means that a guest that had both an interrupt
+	 * pending (queued) and Q set will restore with only
+	 * one instance of that interrupt instead of 2, but that
+	 * is perfectly fine as coalescing interrupts that haven't
+	 * been presented yet is always allowed.
+	 */
+	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
+		state->old_p = true;
+	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
+		state->old_q = true;
+
+	pr_devel("  P=%d, Q=%d\n", state->old_p, state->old_q);
+
+	/*
+	 * If the interrupt was unmasked, update guest priority and
+	 * perform the appropriate state transition and do a
+	 * re-trigger if necessary.
+	 */
+	if (val & KVM_XICS_MASKED) {
+		pr_devel("  masked, saving prio\n");
+		state->guest_priority = MASKED;
+		state->saved_priority = guest_prio;
+	} else {
+		pr_devel("  unmasked, restoring to prio %d\n", guest_prio);
+		xive_finish_unmask(xive, sb, state, guest_prio);
+		state->saved_priority = guest_prio;
+	}
+
+	/* Increment the number of valid sources and mark this one valid */
+	if (!state->valid)
+		xive->src_count++;
+	state->valid = true;
+
+	return 0;
+}
+
+int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+			bool line_status)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+
+	/* Perform locklessly .... (we need to do some RCUisms here...) */
+	state = &sb->irq_state[idx];
+	if (!state->valid)
+		return -EINVAL;
+
+	/* We don't allow a trigger on a passed-through interrupt */
+	if (state->pt_number)
+		return -EINVAL;
+
+	if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = true;
+	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
+		state->asserted = false;
+		return 0;
+	}
+
+	/* Trigger the IPI */
+	xive_irq_trigger(&state->ipi_data);
+
+	return 0;
+}
+
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
+{
+	u32 __user *ubufp = (u32 __user *) addr;
+	u32 nr_servers;
+	int rc = 0;
+
+	if (get_user(nr_servers, ubufp))
+		return -EFAULT;
+
+	pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
+
+	if (!nr_servers || nr_servers > KVM_MAX_VCPU_IDS)
+		return -EINVAL;
+
+	mutex_lock(&xive->lock);
+	if (xive->vp_base != XIVE_INVALID_VP)
+		/* The VP block is allocated once and freed when the device
+		 * is released. Better not allow to change its size since its
+		 * used by connect_vcpu to validate vCPU ids are valid (eg,
+		 * setting it back to a higher value could allow connect_vcpu
+		 * to come up with a VP id that goes beyond the VP block, which
+		 * is likely to cause a crash in OPAL).
+		 */
+		rc = -EBUSY;
+	else if (nr_servers > KVM_MAX_VCPUS)
+		/* We don't need more servers. Higher vCPU ids get packed
+		 * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
+		 */
+		xive->nr_servers = KVM_MAX_VCPUS;
+	else
+		xive->nr_servers = nr_servers;
+
+	mutex_unlock(&xive->lock);
+
+	return rc;
+}
+
+static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* We honor the existing XICS ioctl */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xive_set_source(xive, attr->attr, attr->addr);
+	case KVM_DEV_XICS_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XICS_NR_SERVERS:
+			return kvmppc_xive_set_nr_servers(xive, attr->addr);
+		}
+	}
+	return -ENXIO;
+}
+
+static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* We honor the existing XICS ioctl */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xive_get_source(xive, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	/* We honor the same limits as XICS, at least for now */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	case KVM_DEV_XICS_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XICS_NR_SERVERS:
+			return 0;
+		}
+	}
+	return -ENXIO;
+}
+
+static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
+{
+	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+	xive_native_configure_irq(hw_num, 0, MASKED, 0);
+}
+
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
+{
+	int i;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
+
+		if (!state->valid)
+			continue;
+
+		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
+		xive_cleanup_irq_data(&state->ipi_data);
+		xive_native_free_irq(state->ipi_number);
+
+		/* Pass-through, cleanup too but keep IRQ hw data */
+		if (state->pt_number)
+			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
+
+		state->valid = false;
+	}
+}
+
+/*
+ * Called when device fd is closed.  kvm->lock is held.
+ */
+static void kvmppc_xive_release(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	pr_devel("Releasing xive device\n");
+
+	/*
+	 * Since this is the device release function, we know that
+	 * userspace does not have any open fd referring to the
+	 * device.  Therefore there can not be any of the device
+	 * attribute set/get functions being executed concurrently,
+	 * and similarly, the connect_vcpu and set/clr_mapped
+	 * functions also cannot be being executed.
+	 */
+
+	debugfs_remove(xive->dentry);
+
+	/*
+	 * We should clean up the vCPU interrupt presenters first.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/*
+		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+		 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
+		 * Holding the vcpu->mutex also means that the vcpu cannot
+		 * be executing the KVM_RUN ioctl, and therefore it cannot
+		 * be executing the XIVE push or pull code or accessing
+		 * the XIVE MMIO regions.
+		 */
+		mutex_lock(&vcpu->mutex);
+		kvmppc_xive_cleanup_vcpu(vcpu);
+		mutex_unlock(&vcpu->mutex);
+	}
+
+	/*
+	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+	 * against xive code getting called during vcpu execution or
+	 * set/get one_reg operations.
+	 */
+	kvm->arch.xive = NULL;
+
+	/* Mask and free interrupts */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		if (xive->src_blocks[i])
+			kvmppc_xive_free_sources(xive->src_blocks[i]);
+		kfree(xive->src_blocks[i]);
+		xive->src_blocks[i] = NULL;
+	}
+
+	if (xive->vp_base != XIVE_INVALID_VP)
+		xive_native_free_vp_block(xive->vp_base);
+
+	/*
+	 * A reference of the kvmppc_xive pointer is now kept under
+	 * the xive_devices struct of the machine for reuse. It is
+	 * freed when the VM is destroyed for now until we fix all the
+	 * execution paths.
+	 */
+
+	kfree(dev);
+}
+
+/*
+ * When the guest chooses the interrupt mode (XICS legacy or XIVE
+ * native), the VM will switch of KVM device. The previous device will
+ * be "released" before the new one is created.
+ *
+ * Until we are sure all execution paths are well protected, provide a
+ * fail safe (transitional) method for device destruction, in which
+ * the XIVE device pointer is recycled and not directly freed.
+ */
+struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
+{
+	struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
+		&kvm->arch.xive_devices.native :
+		&kvm->arch.xive_devices.xics_on_xive;
+	struct kvmppc_xive *xive = *kvm_xive_device;
+
+	if (!xive) {
+		xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+		*kvm_xive_device = xive;
+	} else {
+		memset(xive, 0, sizeof(*xive));
+	}
+
+	return xive;
+}
+
+/*
+ * Create a XICS device with XIVE backend.  kvm->lock is held.
+ */
+static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xive *xive;
+	struct kvm *kvm = dev->kvm;
+
+	pr_devel("Creating xive for partition\n");
+
+	/* Already there ? */
+	if (kvm->arch.xive)
+		return -EEXIST;
+
+	xive = kvmppc_xive_get_device(kvm, type);
+	if (!xive)
+		return -ENOMEM;
+
+	dev->private = xive;
+	xive->dev = dev;
+	xive->kvm = kvm;
+	mutex_init(&xive->lock);
+
+	/* We use the default queue size set by the host */
+	xive->q_order = xive_native_default_eq_shift();
+	if (xive->q_order < PAGE_SHIFT)
+		xive->q_page_order = 0;
+	else
+		xive->q_page_order = xive->q_order - PAGE_SHIFT;
+
+	/* VP allocation is delayed to the first call to connect_vcpu */
+	xive->vp_base = XIVE_INVALID_VP;
+	/* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets
+	 * on a POWER9 system.
+	 */
+	xive->nr_servers = KVM_MAX_VCPUS;
+
+	if (xive_native_has_single_escalation())
+		xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+
+	if (xive_native_has_save_restore())
+		xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
+
+	kvm->arch.xive = xive;
+	return 0;
+}
+
+int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	/* The VM should have configured XICS mode before doing XICS hcalls. */
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
+
+	switch (req) {
+	case H_XIRR:
+		return xive_vm_h_xirr(vcpu);
+	case H_CPPR:
+		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_EOI:
+		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_IPI:
+		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+					  kvmppc_get_gpr(vcpu, 5));
+	case H_IPOLL:
+		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_XIRR_X:
+		xive_vm_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
+		return H_SUCCESS;
+	}
+
+	return H_UNSUPPORTED;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
+
+int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	unsigned int i;
+
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		struct xive_q *q = &xc->queues[i];
+		u32 i0, i1, idx;
+
+		if (!q->qpage && !xc->esc_virq[i])
+			continue;
+
+		if (q->qpage) {
+			seq_printf(m, "    q[%d]: ", i);
+			idx = q->idx;
+			i0 = be32_to_cpup(q->qpage + idx);
+			idx = (idx + 1) & q->msk;
+			i1 = be32_to_cpup(q->qpage + idx);
+			seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
+				   i0, i1);
+		}
+		if (xc->esc_virq[i]) {
+			struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
+			struct xive_irq_data *xd =
+				irq_data_get_irq_handler_data(d);
+			u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+
+			seq_printf(m, "    ESC %d %c%c EOI @%llx",
+				   xc->esc_virq[i],
+				   (pq & XIVE_ESB_VAL_P) ? 'P' : '-',
+				   (pq & XIVE_ESB_VAL_Q) ? 'Q' : '-',
+				   xd->eoi_page);
+			seq_puts(m, "\n");
+		}
+	}
+	return 0;
+}
+
+void kvmppc_xive_debug_show_sources(struct seq_file *m,
+				    struct kvmppc_xive_src_block *sb)
+{
+	int i;
+
+	seq_puts(m, "    LISN      HW/CHIP   TYPE    PQ      EISN    CPU/PRIO\n");
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
+		struct xive_irq_data *xd;
+		u64 pq;
+		u32 hw_num;
+
+		if (!state->valid)
+			continue;
+
+		kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+		pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+
+		seq_printf(m, "%08x  %08x/%02x", state->number, hw_num,
+			   xd->src_chip);
+		if (state->lsi)
+			seq_printf(m, " %cLSI", state->asserted ? '^' : ' ');
+		else
+			seq_puts(m, "  MSI");
+
+		seq_printf(m, " %s  %c%c  %08x   % 4d/%d",
+			   state->ipi_number == hw_num ? "IPI" : " PT",
+			   pq & XIVE_ESB_VAL_P ? 'P' : '-',
+			   pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
+			   state->eisn, state->act_server,
+			   state->act_priority);
+
+		seq_puts(m, "\n");
+	}
+}
+
+static int xive_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xive *xive = m->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	u64 t_rm_h_xirr = 0;
+	u64 t_rm_h_ipoll = 0;
+	u64 t_rm_h_cppr = 0;
+	u64 t_rm_h_eoi = 0;
+	u64 t_rm_h_ipi = 0;
+	u64 t_vm_h_xirr = 0;
+	u64 t_vm_h_ipoll = 0;
+	u64 t_vm_h_cppr = 0;
+	u64 t_vm_h_eoi = 0;
+	u64 t_vm_h_ipi = 0;
+	unsigned long i;
+
+	if (!kvm)
+		return 0;
+
+	seq_puts(m, "=========\nVCPU state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+		if (!xc)
+			continue;
+
+		seq_printf(m, "VCPU %d: VP:%#x/%02x\n"
+			 "    CPPR:%#x HWCPPR:%#x MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
+			 xc->server_num, xc->vp_id, xc->vp_chip_id,
+			 xc->cppr, xc->hw_cppr,
+			 xc->mfrr, xc->pending,
+			 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
+
+		kvmppc_xive_debug_show_queues(m, vcpu);
+
+		t_rm_h_xirr += xc->stat_rm_h_xirr;
+		t_rm_h_ipoll += xc->stat_rm_h_ipoll;
+		t_rm_h_cppr += xc->stat_rm_h_cppr;
+		t_rm_h_eoi += xc->stat_rm_h_eoi;
+		t_rm_h_ipi += xc->stat_rm_h_ipi;
+		t_vm_h_xirr += xc->stat_vm_h_xirr;
+		t_vm_h_ipoll += xc->stat_vm_h_ipoll;
+		t_vm_h_cppr += xc->stat_vm_h_cppr;
+		t_vm_h_eoi += xc->stat_vm_h_eoi;
+		t_vm_h_ipi += xc->stat_vm_h_ipi;
+	}
+
+	seq_puts(m, "Hcalls totals\n");
+	seq_printf(m, " H_XIRR  R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
+	seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
+	seq_printf(m, " H_CPPR  R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
+	seq_printf(m, " H_EOI   R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
+	seq_printf(m, " H_IPI   R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
+
+	seq_puts(m, "=========\nSources\n=========\n");
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (sb) {
+			arch_spin_lock(&sb->lock);
+			kvmppc_xive_debug_show_sources(m, sb);
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(xive_debug);
+
+static void xive_debugfs_init(struct kvmppc_xive *xive)
+{
+	xive->dentry = debugfs_create_file("xive", S_IRUGO, xive->kvm->debugfs_dentry,
+					   xive, &xive_debug_fops);
+
+	pr_debug("%s: created\n", __func__);
+}
+
+static void kvmppc_xive_init(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* Register some debug interfaces */
+	xive_debugfs_init(xive);
+}
+
+struct kvm_device_ops kvm_xive_ops = {
+	.name = "kvm-xive",
+	.create = kvmppc_xive_create,
+	.init = kvmppc_xive_init,
+	.release = kvmppc_xive_release,
+	.set_attr = xive_set_attr,
+	.get_attr = xive_get_attr,
+	.has_attr = xive_has_attr,
+};
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
new file mode 100644
index 0000000000..62bf39f537
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -0,0 +1,313 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XIVE_H
+#define _KVM_PPC_BOOK3S_XIVE_H
+
+#ifdef CONFIG_KVM_XICS
+#include "book3s_xics.h"
+
+/*
+ * The XIVE Interrupt source numbers are within the range 0 to
+ * KVMPPC_XICS_NR_IRQS.
+ */
+#define KVMPPC_XIVE_FIRST_IRQ	0
+#define KVMPPC_XIVE_NR_IRQS	KVMPPC_XICS_NR_IRQS
+
+/*
+ * State for one guest irq source.
+ *
+ * For each guest source we allocate a HW interrupt in the XIVE
+ * which we use for all SW triggers. It will be unused for
+ * pass-through but it's easier to keep around as the same
+ * guest interrupt can alternatively be emulated or pass-through
+ * if a physical device is hot unplugged and replaced with an
+ * emulated one.
+ *
+ * This state structure is very similar to the XICS one with
+ * additional XIVE specific tracking.
+ */
+struct kvmppc_xive_irq_state {
+	bool valid;			/* Interrupt entry is valid */
+
+	u32 number;			/* Guest IRQ number */
+	u32 ipi_number;			/* XIVE IPI HW number */
+	struct xive_irq_data ipi_data;	/* XIVE IPI associated data */
+	u32 pt_number;			/* XIVE Pass-through number if any */
+	struct xive_irq_data *pt_data;	/* XIVE Pass-through associated data */
+
+	/* Targetting as set by guest */
+	u8 guest_priority;		/* Guest set priority */
+	u8 saved_priority;		/* Saved priority when masking */
+
+	/* Actual targetting */
+	u32 act_server;			/* Actual server */
+	u8 act_priority;		/* Actual priority */
+
+	/* Various state bits */
+	bool in_eoi;			/* Synchronize with H_EOI */
+	bool old_p;			/* P bit state when masking */
+	bool old_q;			/* Q bit state when masking */
+	bool lsi;			/* level-sensitive interrupt */
+	bool asserted;			/* Only for emulated LSI: current state */
+
+	/* Saved for migration state */
+	bool in_queue;
+	bool saved_p;
+	bool saved_q;
+	u8 saved_scan_prio;
+
+	/* Xive native */
+	u32 eisn;			/* Guest Effective IRQ number */
+};
+
+/* Select the "right" interrupt (IPI vs. passthrough) */
+static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state,
+					  u32 *out_hw_irq,
+					  struct xive_irq_data **out_xd)
+{
+	if (state->pt_number) {
+		if (out_hw_irq)
+			*out_hw_irq = state->pt_number;
+		if (out_xd)
+			*out_xd = state->pt_data;
+	} else {
+		if (out_hw_irq)
+			*out_hw_irq = state->ipi_number;
+		if (out_xd)
+			*out_xd = &state->ipi_data;
+	}
+}
+
+/*
+ * This corresponds to an "ICS" in XICS terminology, we use it
+ * as a mean to break up source information into multiple structures.
+ */
+struct kvmppc_xive_src_block {
+	arch_spinlock_t lock;
+	u16 id;
+	struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xive;
+
+struct kvmppc_xive_ops {
+	int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
+};
+
+#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1
+#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2
+
+struct kvmppc_xive {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct dentry *dentry;
+
+	/* VP block associated with the VM */
+	u32	vp_base;
+
+	/* Blocks of sources */
+	struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1];
+	u32	max_sbid;
+
+	/*
+	 * For state save, we lazily scan the queues on the first interrupt
+	 * being migrated. We don't have a clean way to reset that flags
+	 * so we keep track of the number of valid sources and how many of
+	 * them were migrated so we can reset when all of them have been
+	 * processed.
+	 */
+	u32	src_count;
+	u32	saved_src_count;
+
+	/*
+	 * Some irqs are delayed on restore until the source is created,
+	 * keep track here of how many of them
+	 */
+	u32	delayed_irqs;
+
+	/* Which queues (priorities) are in use by the guest */
+	u8	qmap;
+
+	/* Queue orders */
+	u32	q_order;
+	u32	q_page_order;
+
+	/* Flags */
+	u8	flags;
+
+	/* Number of entries in the VP block */
+	u32	nr_servers;
+
+	struct kvmppc_xive_ops *ops;
+	struct address_space   *mapping;
+	struct mutex mapping_lock;
+	struct mutex lock;
+};
+
+#define KVMPPC_XIVE_Q_COUNT	8
+
+struct kvmppc_xive_vcpu {
+	struct kvmppc_xive	*xive;
+	struct kvm_vcpu		*vcpu;
+	bool			valid;
+
+	/* Server number. This is the HW CPU ID from a guest perspective */
+	u32			server_num;
+
+	/*
+	 * HW VP corresponding to this VCPU. This is the base of the VP
+	 * block plus the server number.
+	 */
+	u32			vp_id;
+	u32			vp_chip_id;
+	u32			vp_cam;
+
+	/* IPI used for sending ... IPIs */
+	u32			vp_ipi;
+	struct xive_irq_data	vp_ipi_data;
+
+	/* Local emulation state */
+	uint8_t			cppr;	/* guest CPPR */
+	uint8_t			hw_cppr;/* Hardware CPPR */
+	uint8_t			mfrr;
+	uint8_t			pending;
+
+	/* Each VP has 8 queues though we only provision some */
+	struct xive_q		queues[KVMPPC_XIVE_Q_COUNT];
+	u32			esc_virq[KVMPPC_XIVE_Q_COUNT];
+	char			*esc_virq_names[KVMPPC_XIVE_Q_COUNT];
+
+	/* Stash a delayed irq on restore from migration (see set_icp) */
+	u32			delayed_irq;
+
+	/* Stats */
+	u64			stat_rm_h_xirr;
+	u64			stat_rm_h_ipoll;
+	u64			stat_rm_h_cppr;
+	u64			stat_rm_h_eoi;
+	u64			stat_rm_h_ipi;
+	u64			stat_vm_h_xirr;
+	u64			stat_vm_h_ipoll;
+	u64			stat_vm_h_cppr;
+	u64			stat_vm_h_eoi;
+	u64			stat_vm_h_ipi;
+};
+
+static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num)
+			return vcpu;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive,
+		u32 irq, u16 *source)
+{
+	u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+
+	if (source)
+		*source = src;
+	if (bid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	return xive->src_blocks[bid];
+}
+
+/*
+ * When the XIVE resources are allocated at the HW level, the VP
+ * structures describing the vCPUs of a guest are distributed among
+ * the chips to optimize the PowerBUS usage. For best performance, the
+ * guest vCPUs can be pinned to match the VP structure distribution.
+ *
+ * Currently, the VP identifiers are deduced from the vCPU id using
+ * the kvmppc_pack_vcpu_id() routine which is not incorrect but not
+ * optimal either. It VSMT is used, the result is not continuous and
+ * the constraints on HW resources described above can not be met.
+ */
+static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server)
+{
+	return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
+}
+
+static inline bool kvmppc_xive_vp_in_use(struct kvm *kvm, u32 vp_id)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.xive_vcpu && vp_id == vcpu->arch.xive_vcpu->vp_id)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Mapping between guest priorities and host priorities
+ * is as follow.
+ *
+ * Guest request for 0...6 are honored. Guest request for anything
+ * higher results in a priority of 6 being applied.
+ *
+ * Similar mapping is done for CPPR values
+ */
+static inline u8 xive_prio_from_guest(u8 prio)
+{
+	if (prio == 0xff || prio < 6)
+		return prio;
+	return 6;
+}
+
+static inline u8 xive_prio_to_guest(u8 prio)
+{
+	return prio;
+}
+
+static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
+{
+	u32 cur;
+
+	if (!qpage)
+		return 0;
+	cur = be32_to_cpup(qpage + *idx);
+	if ((cur >> 31) == *toggle)
+		return 0;
+	*idx = (*idx + 1) & msk;
+	if (*idx == 0)
+		(*toggle) ^= 1;
+	return cur & 0x7fffffff;
+}
+
+/*
+ * Common Xive routines for XICS-over-XIVE and XIVE native
+ */
+void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
+int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
+void kvmppc_xive_debug_show_sources(struct seq_file *m,
+				    struct kvmppc_xive_src_block *sb);
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+	struct kvmppc_xive *xive, int irq);
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
+int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
+int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
+				  bool single_escalation);
+struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int irq);
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr);
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu);
+
+static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive)
+{
+	return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+}
+
+#endif /* CONFIG_KVM_XICS */
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
new file mode 100644
index 0000000000..712ab91ced
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -0,0 +1,1284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017-2019, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "xive-kvm: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/irqdomain.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/debug.h>
+#include <asm/opal.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xive.h"
+
+static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	/*
+	 * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
+	 * load operation, so there is no need to enforce load-after-store
+	 * ordering.
+	 */
+
+	val = in_be64(xd->eoi_mmio + offset);
+	return (u8)val;
+}
+
+static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_q *q = &xc->queues[prio];
+
+	xive_native_disable_queue(xc->vp_id, q, prio);
+	if (q->qpage) {
+		put_page(virt_to_page(q->qpage));
+		q->qpage = NULL;
+	}
+}
+
+static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
+					      u8 prio, __be32 *qpage,
+					      u32 order, bool can_escalate)
+{
+	int rc;
+	__be32 *qpage_prev = q->qpage;
+
+	rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
+					 can_escalate);
+	if (rc)
+		return rc;
+
+	if (qpage_prev)
+		put_page(virt_to_page(qpage_prev));
+
+	return rc;
+}
+
+void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	int i;
+
+	if (!kvmppc_xive_enabled(vcpu))
+		return;
+
+	if (!xc)
+		return;
+
+	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
+
+	/* Ensure no interrupt is still routed to that VP */
+	xc->valid = false;
+	kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+	/* Free escalations */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		/* Free the escalation irq */
+		if (xc->esc_virq[i]) {
+			if (kvmppc_xive_has_single_escalation(xc->xive))
+				xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]);
+			free_irq(xc->esc_virq[i], vcpu);
+			irq_dispose_mapping(xc->esc_virq[i]);
+			kfree(xc->esc_virq_names[i]);
+			xc->esc_virq[i] = 0;
+		}
+	}
+
+	/* Disable the VP */
+	xive_native_disable_vp(xc->vp_id);
+
+	/* Clear the cam word so guest entry won't try to push context */
+	vcpu->arch.xive_cam_word = 0;
+
+	/* Free the queues */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		kvmppc_xive_native_cleanup_queue(vcpu, i);
+	}
+
+	/* Free the VP */
+	kfree(xc);
+
+	/* Cleanup the vcpu */
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+	vcpu->arch.xive_vcpu = NULL;
+}
+
+int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
+				    struct kvm_vcpu *vcpu, u32 server_num)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvmppc_xive_vcpu *xc = NULL;
+	int rc;
+	u32 vp_id;
+
+	pr_devel("native_connect_vcpu(server=%d)\n", server_num);
+
+	if (dev->ops != &kvm_xive_native_ops) {
+		pr_devel("Wrong ops !\n");
+		return -EPERM;
+	}
+	if (xive->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
+		return -EBUSY;
+
+	mutex_lock(&xive->lock);
+
+	rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
+	if (rc)
+		goto bail;
+
+	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+	if (!xc) {
+		rc = -ENOMEM;
+		goto bail;
+	}
+
+	vcpu->arch.xive_vcpu = xc;
+	xc->xive = xive;
+	xc->vcpu = vcpu;
+	xc->server_num = server_num;
+
+	xc->vp_id = vp_id;
+	xc->valid = true;
+	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
+
+	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
+	if (rc) {
+		pr_err("Failed to get VP info from OPAL: %d\n", rc);
+		goto bail;
+	}
+
+	if (!kvmppc_xive_check_save_restore(vcpu)) {
+		pr_err("inconsistent save-restore setup for VCPU %d\n", server_num);
+		rc = -EIO;
+		goto bail;
+	}
+
+	/*
+	 * Enable the VP first as the single escalation mode will
+	 * affect escalation interrupts numbering
+	 */
+	rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive));
+	if (rc) {
+		pr_err("Failed to enable VP in OPAL: %d\n", rc);
+		goto bail;
+	}
+
+	/* Configure VCPU fields for use by assembly push/pull */
+	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
+	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
+
+	/* TODO: reset all queues to a clean state ? */
+bail:
+	mutex_unlock(&xive->lock);
+	if (rc)
+		kvmppc_xive_native_cleanup_vcpu(vcpu);
+
+	return rc;
+}
+
+/*
+ * Device passthrough support
+ */
+static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
+
+	if (irq >= KVMPPC_XIVE_NR_IRQS)
+		return -EINVAL;
+
+	/*
+	 * Clear the ESB pages of the IRQ number being mapped (or
+	 * unmapped) into the guest and let the VM fault handler
+	 * repopulate with the appropriate ESB pages (device or IC)
+	 */
+	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
+	mutex_lock(&xive->mapping_lock);
+	if (xive->mapping)
+		unmap_mapping_range(xive->mapping,
+				    esb_pgoff << PAGE_SHIFT,
+				    2ull << PAGE_SHIFT, 1);
+	mutex_unlock(&xive->mapping_lock);
+	return 0;
+}
+
+static struct kvmppc_xive_ops kvmppc_xive_native_ops =  {
+	.reset_mapped = kvmppc_xive_native_reset_mapped,
+};
+
+static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct kvm_device *dev = vma->vm_file->private_data;
+	struct kvmppc_xive *xive = dev->private;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct xive_irq_data *xd;
+	u32 hw_num;
+	u16 src;
+	u64 page;
+	unsigned long irq;
+	u64 page_offset;
+
+	/*
+	 * Linux/KVM uses a two pages ESB setting, one for trigger and
+	 * one for EOI
+	 */
+	page_offset = vmf->pgoff - vma->vm_pgoff;
+	irq = page_offset / 2;
+
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb) {
+		pr_devel("%s: source %lx not found !\n", __func__, irq);
+		return VM_FAULT_SIGBUS;
+	}
+
+	state = &sb->irq_state[src];
+
+	/* Some sanity checking */
+	if (!state->valid) {
+		pr_devel("%s: source %lx invalid !\n", __func__, irq);
+		return VM_FAULT_SIGBUS;
+	}
+
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	arch_spin_lock(&sb->lock);
+
+	/*
+	 * first/even page is for trigger
+	 * second/odd page is for EOI and management.
+	 */
+	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
+	arch_spin_unlock(&sb->lock);
+
+	if (WARN_ON(!page)) {
+		pr_err("%s: accessing invalid ESB page for source %lx !\n",
+		       __func__, irq);
+		return VM_FAULT_SIGBUS;
+	}
+
+	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct xive_native_esb_vmops = {
+	.fault = xive_native_esb_fault,
+};
+
+static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+
+	switch (vmf->pgoff - vma->vm_pgoff) {
+	case 0: /* HW - forbid access */
+	case 1: /* HV - forbid access */
+		return VM_FAULT_SIGBUS;
+	case 2: /* OS */
+		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
+		return VM_FAULT_NOPAGE;
+	case 3: /* USER - TODO */
+	default:
+		return VM_FAULT_SIGBUS;
+	}
+}
+
+static const struct vm_operations_struct xive_native_tima_vmops = {
+	.fault = xive_native_tima_fault,
+};
+
+static int kvmppc_xive_native_mmap(struct kvm_device *dev,
+				   struct vm_area_struct *vma)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* We only allow mappings at fixed offset for now */
+	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
+		if (vma_pages(vma) > 4)
+			return -EINVAL;
+		vma->vm_ops = &xive_native_tima_vmops;
+	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
+		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
+			return -EINVAL;
+		vma->vm_ops = &xive_native_esb_vmops;
+	} else {
+		return -EINVAL;
+	}
+
+	vm_flags_set(vma, VM_IO | VM_PFNMAP);
+	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
+
+	/*
+	 * Grab the KVM device file address_space to be able to clear
+	 * the ESB pages mapping when a device is passed-through into
+	 * the guest.
+	 */
+	xive->mapping = vma->vm_file->f_mapping;
+	return 0;
+}
+
+static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
+					 u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u64 val;
+	u16 idx;
+	int rc;
+
+	pr_devel("%s irq=0x%lx\n", __func__, irq);
+
+	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
+		return -E2BIG;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb) {
+		pr_debug("No source, creating source block...\n");
+		sb = kvmppc_xive_create_src_block(xive, irq);
+		if (!sb) {
+			pr_err("Failed to create block...\n");
+			return -ENOMEM;
+		}
+	}
+	state = &sb->irq_state[idx];
+
+	if (get_user(val, ubufp)) {
+		pr_err("fault getting user info !\n");
+		return -EFAULT;
+	}
+
+	arch_spin_lock(&sb->lock);
+
+	/*
+	 * If the source doesn't already have an IPI, allocate
+	 * one and get the corresponding data
+	 */
+	if (!state->ipi_number) {
+		state->ipi_number = xive_native_alloc_irq();
+		if (state->ipi_number == 0) {
+			pr_err("Failed to allocate IRQ !\n");
+			rc = -ENXIO;
+			goto unlock;
+		}
+		xive_native_populate_irq_data(state->ipi_number,
+					      &state->ipi_data);
+		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
+			 state->ipi_number, irq);
+	}
+
+	/* Restore LSI state */
+	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
+		state->lsi = true;
+		if (val & KVM_XIVE_LEVEL_ASSERTED)
+			state->asserted = true;
+		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
+	}
+
+	/* Mask IRQ to start with */
+	state->act_server = 0;
+	state->act_priority = MASKED;
+	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+
+	/* Increment the number of valid sources and mark this one valid */
+	if (!state->valid)
+		xive->src_count++;
+	state->valid = true;
+
+	rc = 0;
+
+unlock:
+	arch_spin_unlock(&sb->lock);
+
+	return rc;
+}
+
+static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
+					struct kvmppc_xive_src_block *sb,
+					struct kvmppc_xive_irq_state *state,
+					u32 server, u8 priority, bool masked,
+					u32 eisn)
+{
+	struct kvm *kvm = xive->kvm;
+	u32 hw_num;
+	int rc = 0;
+
+	arch_spin_lock(&sb->lock);
+
+	if (state->act_server == server && state->act_priority == priority &&
+	    state->eisn == eisn)
+		goto unlock;
+
+	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
+		 priority, server, masked, state->act_server,
+		 state->act_priority);
+
+	kvmppc_xive_select_irq(state, &hw_num, NULL);
+
+	if (priority != MASKED && !masked) {
+		rc = kvmppc_xive_select_target(kvm, &server, priority);
+		if (rc)
+			goto unlock;
+
+		state->act_priority = priority;
+		state->act_server = server;
+		state->eisn = eisn;
+
+		rc = xive_native_configure_irq(hw_num,
+					       kvmppc_xive_vp(xive, server),
+					       priority, eisn);
+	} else {
+		state->act_priority = MASKED;
+		state->act_server = 0;
+		state->eisn = 0;
+
+		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
+	}
+
+unlock:
+	arch_spin_unlock(&sb->lock);
+	return rc;
+}
+
+static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
+						long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 src;
+	u64 kvm_cfg;
+	u32 server;
+	u8 priority;
+	bool masked;
+	u32 eisn;
+
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb)
+		return -ENOENT;
+
+	state = &sb->irq_state[src];
+
+	if (!state->valid)
+		return -EINVAL;
+
+	if (get_user(kvm_cfg, ubufp))
+		return -EFAULT;
+
+	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
+
+	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
+		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
+	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
+		KVM_XIVE_SOURCE_SERVER_SHIFT;
+	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
+		KVM_XIVE_SOURCE_MASKED_SHIFT;
+	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
+		KVM_XIVE_SOURCE_EISN_SHIFT;
+
+	if (priority != xive_prio_from_guest(priority)) {
+		pr_err("invalid priority for queue %d for VCPU %d\n",
+		       priority, server);
+		return -EINVAL;
+	}
+
+	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
+						       priority, masked, eisn);
+}
+
+static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
+					  long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct xive_irq_data *xd;
+	u32 hw_num;
+	u16 src;
+	int rc = 0;
+
+	pr_devel("%s irq=0x%lx", __func__, irq);
+
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb)
+		return -ENOENT;
+
+	state = &sb->irq_state[src];
+
+	rc = -EINVAL;
+
+	arch_spin_lock(&sb->lock);
+
+	if (state->valid) {
+		kvmppc_xive_select_irq(state, &hw_num, &xd);
+		xive_native_sync_source(hw_num);
+		rc = 0;
+	}
+
+	arch_spin_unlock(&sb->lock);
+	return rc;
+}
+
+static int xive_native_validate_queue_size(u32 qshift)
+{
+	/*
+	 * We only support 64K pages for the moment. This is also
+	 * advertised in the DT property "ibm,xive-eq-sizes"
+	 */
+	switch (qshift) {
+	case 0: /* EQ reset */
+	case 16:
+		return 0;
+	case 12:
+	case 21:
+	case 24:
+	default:
+		return -EINVAL;
+	}
+}
+
+static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
+					       long eq_idx, u64 addr)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	struct kvmppc_xive_vcpu *xc;
+	void __user *ubufp = (void __user *) addr;
+	u32 server;
+	u8 priority;
+	struct kvm_ppc_xive_eq kvm_eq;
+	int rc;
+	__be32 *qaddr = 0;
+	struct page *page;
+	struct xive_q *q;
+	gfn_t gfn;
+	unsigned long page_size;
+	int srcu_idx;
+
+	/*
+	 * Demangle priority/server tuple from the EQ identifier
+	 */
+	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
+		KVM_XIVE_EQ_PRIORITY_SHIFT;
+	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
+		KVM_XIVE_EQ_SERVER_SHIFT;
+
+	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
+		return -EFAULT;
+
+	vcpu = kvmppc_xive_find_server(kvm, server);
+	if (!vcpu) {
+		pr_err("Can't find server %d\n", server);
+		return -ENOENT;
+	}
+	xc = vcpu->arch.xive_vcpu;
+
+	if (priority != xive_prio_from_guest(priority)) {
+		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
+		       priority, server);
+		return -EINVAL;
+	}
+	q = &xc->queues[priority];
+
+	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
+		 __func__, server, priority, kvm_eq.flags,
+		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
+
+	/* reset queue and disable queueing */
+	if (!kvm_eq.qshift) {
+		q->guest_qaddr  = 0;
+		q->guest_qshift = 0;
+
+		rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
+							NULL, 0, true);
+		if (rc) {
+			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
+			       priority, xc->server_num, rc);
+			return rc;
+		}
+
+		return 0;
+	}
+
+	/*
+	 * sPAPR specifies a "Unconditional Notify (n) flag" for the
+	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
+	 * without using the coalescing mechanisms provided by the
+	 * XIVE END ESBs. This is required on KVM as notification
+	 * using the END ESBs is not supported.
+	 */
+	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
+		pr_err("invalid flags %d\n", kvm_eq.flags);
+		return -EINVAL;
+	}
+
+	rc = xive_native_validate_queue_size(kvm_eq.qshift);
+	if (rc) {
+		pr_err("invalid queue size %d\n", kvm_eq.qshift);
+		return rc;
+	}
+
+	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
+		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
+		       1ull << kvm_eq.qshift);
+		return -EINVAL;
+	}
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	gfn = gpa_to_gfn(kvm_eq.qaddr);
+
+	page_size = kvm_host_page_size(vcpu, gfn);
+	if (1ull << kvm_eq.qshift > page_size) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+		pr_warn("Incompatible host page size %lx!\n", page_size);
+		return -EINVAL;
+	}
+
+	page = gfn_to_page(kvm, gfn);
+	if (is_error_page(page)) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
+		return -EINVAL;
+	}
+
+	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	/*
+	 * Backup the queue page guest address to the mark EQ page
+	 * dirty for migration.
+	 */
+	q->guest_qaddr  = kvm_eq.qaddr;
+	q->guest_qshift = kvm_eq.qshift;
+
+	 /*
+	  * Unconditional Notification is forced by default at the
+	  * OPAL level because the use of END ESBs is not supported by
+	  * Linux.
+	  */
+	rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
+					(__be32 *) qaddr, kvm_eq.qshift, true);
+	if (rc) {
+		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
+		       priority, xc->server_num, rc);
+		put_page(page);
+		return rc;
+	}
+
+	/*
+	 * Only restore the queue state when needed. When doing the
+	 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
+	 */
+	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
+		rc = xive_native_set_queue_state(xc->vp_id, priority,
+						 kvm_eq.qtoggle,
+						 kvm_eq.qindex);
+		if (rc)
+			goto error;
+	}
+
+	rc = kvmppc_xive_attach_escalation(vcpu, priority,
+					   kvmppc_xive_has_single_escalation(xive));
+error:
+	if (rc)
+		kvmppc_xive_native_cleanup_queue(vcpu, priority);
+	return rc;
+}
+
+static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
+					       long eq_idx, u64 addr)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	struct kvmppc_xive_vcpu *xc;
+	struct xive_q *q;
+	void __user *ubufp = (u64 __user *) addr;
+	u32 server;
+	u8 priority;
+	struct kvm_ppc_xive_eq kvm_eq;
+	u64 qaddr;
+	u64 qshift;
+	u64 qeoi_page;
+	u32 escalate_irq;
+	u64 qflags;
+	int rc;
+
+	/*
+	 * Demangle priority/server tuple from the EQ identifier
+	 */
+	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
+		KVM_XIVE_EQ_PRIORITY_SHIFT;
+	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
+		KVM_XIVE_EQ_SERVER_SHIFT;
+
+	vcpu = kvmppc_xive_find_server(kvm, server);
+	if (!vcpu) {
+		pr_err("Can't find server %d\n", server);
+		return -ENOENT;
+	}
+	xc = vcpu->arch.xive_vcpu;
+
+	if (priority != xive_prio_from_guest(priority)) {
+		pr_err("invalid priority for queue %d for VCPU %d\n",
+		       priority, server);
+		return -EINVAL;
+	}
+	q = &xc->queues[priority];
+
+	memset(&kvm_eq, 0, sizeof(kvm_eq));
+
+	if (!q->qpage)
+		return 0;
+
+	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
+					&qeoi_page, &escalate_irq, &qflags);
+	if (rc)
+		return rc;
+
+	kvm_eq.flags = 0;
+	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
+
+	kvm_eq.qshift = q->guest_qshift;
+	kvm_eq.qaddr  = q->guest_qaddr;
+
+	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
+					 &kvm_eq.qindex);
+	if (rc)
+		return rc;
+
+	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
+		 __func__, server, priority, kvm_eq.flags,
+		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
+
+	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
+{
+	int i;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
+
+		if (!state->valid)
+			continue;
+
+		if (state->act_priority == MASKED)
+			continue;
+
+		state->eisn = 0;
+		state->act_server = 0;
+		state->act_priority = MASKED;
+		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+		if (state->pt_number) {
+			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
+			xive_native_configure_irq(state->pt_number,
+						  0, MASKED, 0);
+		}
+	}
+}
+
+static int kvmppc_xive_reset(struct kvmppc_xive *xive)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	pr_devel("%s\n", __func__);
+
+	mutex_lock(&xive->lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+		unsigned int prio;
+
+		if (!xc)
+			continue;
+
+		kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+
+			/* Single escalation, no queue 7 */
+			if (prio == 7 && kvmppc_xive_has_single_escalation(xive))
+				break;
+
+			if (xc->esc_virq[prio]) {
+				free_irq(xc->esc_virq[prio], vcpu);
+				irq_dispose_mapping(xc->esc_virq[prio]);
+				kfree(xc->esc_virq_names[prio]);
+				xc->esc_virq[prio] = 0;
+			}
+
+			kvmppc_xive_native_cleanup_queue(vcpu, prio);
+		}
+	}
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (sb) {
+			arch_spin_lock(&sb->lock);
+			kvmppc_xive_reset_sources(sb);
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+
+	mutex_unlock(&xive->lock);
+
+	return 0;
+}
+
+static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
+{
+	int j;
+
+	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
+		struct xive_irq_data *xd;
+		u32 hw_num;
+
+		if (!state->valid)
+			continue;
+
+		/*
+		 * The struct kvmppc_xive_irq_state reflects the state
+		 * of the EAS configuration and not the state of the
+		 * source. The source is masked setting the PQ bits to
+		 * '-Q', which is what is being done before calling
+		 * the KVM_DEV_XIVE_EQ_SYNC control.
+		 *
+		 * If a source EAS is configured, OPAL syncs the XIVE
+		 * IC of the source and the XIVE IC of the previous
+		 * target if any.
+		 *
+		 * So it should be fine ignoring MASKED sources as
+		 * they have been synced already.
+		 */
+		if (state->act_priority == MASKED)
+			continue;
+
+		kvmppc_xive_select_irq(state, &hw_num, &xd);
+		xive_native_sync_source(hw_num);
+		xive_native_sync_queue(hw_num);
+	}
+}
+
+static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	unsigned int prio;
+	int srcu_idx;
+
+	if (!xc)
+		return -ENOENT;
+
+	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+		struct xive_q *q = &xc->queues[prio];
+
+		if (!q->qpage)
+			continue;
+
+		/* Mark EQ page dirty for migration */
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+	}
+	return 0;
+}
+
+static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	pr_devel("%s\n", __func__);
+
+	mutex_lock(&xive->lock);
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (sb) {
+			arch_spin_lock(&sb->lock);
+			kvmppc_xive_native_sync_sources(sb);
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvmppc_xive_native_vcpu_eq_sync(vcpu);
+	}
+	mutex_unlock(&xive->lock);
+
+	return 0;
+}
+
+static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
+				       struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XIVE_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XIVE_RESET:
+			return kvmppc_xive_reset(xive);
+		case KVM_DEV_XIVE_EQ_SYNC:
+			return kvmppc_xive_native_eq_sync(xive);
+		case KVM_DEV_XIVE_NR_SERVERS:
+			return kvmppc_xive_set_nr_servers(xive, attr->addr);
+		}
+		break;
+	case KVM_DEV_XIVE_GRP_SOURCE:
+		return kvmppc_xive_native_set_source(xive, attr->attr,
+						     attr->addr);
+	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
+		return kvmppc_xive_native_set_source_config(xive, attr->attr,
+							    attr->addr);
+	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
+							   attr->addr);
+	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
+		return kvmppc_xive_native_sync_source(xive, attr->attr,
+						      attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
+				       struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
+							   attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
+				       struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_XIVE_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_XIVE_RESET:
+		case KVM_DEV_XIVE_EQ_SYNC:
+		case KVM_DEV_XIVE_NR_SERVERS:
+			return 0;
+		}
+		break;
+	case KVM_DEV_XIVE_GRP_SOURCE:
+	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
+	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
+		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XIVE_NR_IRQS)
+			return 0;
+		break;
+	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+		return 0;
+	}
+	return -ENXIO;
+}
+
+/*
+ * Called when device fd is closed.  kvm->lock is held.
+ */
+static void kvmppc_xive_native_release(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	pr_devel("Releasing xive native device\n");
+
+	/*
+	 * Clear the KVM device file address_space which is used to
+	 * unmap the ESB pages when a device is passed-through.
+	 */
+	mutex_lock(&xive->mapping_lock);
+	xive->mapping = NULL;
+	mutex_unlock(&xive->mapping_lock);
+
+	/*
+	 * Since this is the device release function, we know that
+	 * userspace does not have any open fd or mmap referring to
+	 * the device.  Therefore there can not be any of the
+	 * device attribute set/get, mmap, or page fault functions
+	 * being executed concurrently, and similarly, the
+	 * connect_vcpu and set/clr_mapped functions also cannot
+	 * be being executed.
+	 */
+
+	debugfs_remove(xive->dentry);
+
+	/*
+	 * We should clean up the vCPU interrupt presenters first.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/*
+		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done.
+		 * Holding the vcpu->mutex also means that the vcpu cannot
+		 * be executing the KVM_RUN ioctl, and therefore it cannot
+		 * be executing the XIVE push or pull code or accessing
+		 * the XIVE MMIO regions.
+		 */
+		mutex_lock(&vcpu->mutex);
+		kvmppc_xive_native_cleanup_vcpu(vcpu);
+		mutex_unlock(&vcpu->mutex);
+	}
+
+	/*
+	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+	 * against xive code getting called during vcpu execution or
+	 * set/get one_reg operations.
+	 */
+	kvm->arch.xive = NULL;
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		if (xive->src_blocks[i])
+			kvmppc_xive_free_sources(xive->src_blocks[i]);
+		kfree(xive->src_blocks[i]);
+		xive->src_blocks[i] = NULL;
+	}
+
+	if (xive->vp_base != XIVE_INVALID_VP)
+		xive_native_free_vp_block(xive->vp_base);
+
+	/*
+	 * A reference of the kvmppc_xive pointer is now kept under
+	 * the xive_devices struct of the machine for reuse. It is
+	 * freed when the VM is destroyed for now until we fix all the
+	 * execution paths.
+	 */
+
+	kfree(dev);
+}
+
+/*
+ * Create a XIVE device.  kvm->lock is held.
+ */
+static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xive *xive;
+	struct kvm *kvm = dev->kvm;
+
+	pr_devel("Creating xive native device\n");
+
+	if (kvm->arch.xive)
+		return -EEXIST;
+
+	xive = kvmppc_xive_get_device(kvm, type);
+	if (!xive)
+		return -ENOMEM;
+
+	dev->private = xive;
+	xive->dev = dev;
+	xive->kvm = kvm;
+	mutex_init(&xive->mapping_lock);
+	mutex_init(&xive->lock);
+
+	/* VP allocation is delayed to the first call to connect_vcpu */
+	xive->vp_base = XIVE_INVALID_VP;
+	/* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets
+	 * on a POWER9 system.
+	 */
+	xive->nr_servers = KVM_MAX_VCPUS;
+
+	if (xive_native_has_single_escalation())
+		xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+
+	if (xive_native_has_save_restore())
+		xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE;
+
+	xive->ops = &kvmppc_xive_native_ops;
+
+	kvm->arch.xive = xive;
+	return 0;
+}
+
+/*
+ * Interrupt Pending Buffer (IPB) offset
+ */
+#define TM_IPB_SHIFT 40
+#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
+
+int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u64 opal_state;
+	int rc;
+
+	if (!kvmppc_xive_enabled(vcpu))
+		return -EPERM;
+
+	if (!xc)
+		return -ENOENT;
+
+	/* Thread context registers. We only care about IPB and CPPR */
+	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
+
+	/* Get the VP state from OPAL */
+	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
+	if (rc)
+		return rc;
+
+	/*
+	 * Capture the backup of IPB register in the NVT structure and
+	 * merge it in our KVM VP state.
+	 */
+	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
+
+	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
+		 __func__,
+		 vcpu->arch.xive_saved_state.nsr,
+		 vcpu->arch.xive_saved_state.cppr,
+		 vcpu->arch.xive_saved_state.ipb,
+		 vcpu->arch.xive_saved_state.pipr,
+		 vcpu->arch.xive_saved_state.w01,
+		 (u32) vcpu->arch.xive_cam_word, opal_state);
+
+	return 0;
+}
+
+int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+
+	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
+		 val->xive_timaval[0], val->xive_timaval[1]);
+
+	if (!kvmppc_xive_enabled(vcpu))
+		return -EPERM;
+
+	if (!xc || !xive)
+		return -ENOENT;
+
+	/* We can't update the state of a "pushed" VCPU	 */
+	if (WARN_ON(vcpu->arch.xive_pushed))
+		return -EBUSY;
+
+	/*
+	 * Restore the thread context registers. IPB and CPPR should
+	 * be the only ones that matter.
+	 */
+	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
+
+	/*
+	 * There is no need to restore the XIVE internal state (IPB
+	 * stored in the NVT) as the IPB register was merged in KVM VP
+	 * state when captured.
+	 */
+	return 0;
+}
+
+bool kvmppc_xive_native_supported(void)
+{
+	return xive_native_has_queue_state_support();
+}
+
+static int xive_native_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xive *xive = m->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	if (!kvm)
+		return 0;
+
+	seq_puts(m, "=========\nVCPU state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+		if (!xc)
+			continue;
+
+		seq_printf(m, "VCPU %d: VP=%#x/%02x\n"
+			   "    NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n",
+			   xc->server_num, xc->vp_id, xc->vp_chip_id,
+			   vcpu->arch.xive_saved_state.nsr,
+			   vcpu->arch.xive_saved_state.cppr,
+			   vcpu->arch.xive_saved_state.ipb,
+			   vcpu->arch.xive_saved_state.pipr,
+			   be64_to_cpu(vcpu->arch.xive_saved_state.w01),
+			   be32_to_cpu(vcpu->arch.xive_cam_word));
+
+		kvmppc_xive_debug_show_queues(m, vcpu);
+	}
+
+	seq_puts(m, "=========\nSources\n=========\n");
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (sb) {
+			arch_spin_lock(&sb->lock);
+			kvmppc_xive_debug_show_sources(m, sb);
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
+
+static void xive_native_debugfs_init(struct kvmppc_xive *xive)
+{
+	xive->dentry = debugfs_create_file("xive", 0444, xive->kvm->debugfs_dentry,
+					   xive, &xive_native_debug_fops);
+
+	pr_debug("%s: created\n", __func__);
+}
+
+static void kvmppc_xive_native_init(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* Register some debug interfaces */
+	xive_native_debugfs_init(xive);
+}
+
+struct kvm_device_ops kvm_xive_native_ops = {
+	.name = "kvm-xive-native",
+	.create = kvmppc_xive_native_create,
+	.init = kvmppc_xive_native_init,
+	.release = kvmppc_xive_native_release,
+	.set_attr = kvmppc_xive_native_set_attr,
+	.get_attr = kvmppc_xive_native_get_attr,
+	.has_attr = kvmppc_xive_native_has_attr,
+	.mmap = kvmppc_xive_native_mmap,
+};
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
new file mode 100644
index 0000000000..6a5be025a8
--- /dev/null
+++ b/arch/powerpc/kvm/booke.c
@@ -0,0 +1,2242 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ *          Scott Wood <scottwood@freescale.com>
+ *          Varun Sethi <varun.sethi@freescale.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+
+#include <asm/cputable.h>
+#include <linux/uaccess.h>
+#include <asm/interrupt.h>
+#include <asm/kvm_ppc.h>
+#include <asm/cacheflush.h>
+#include <asm/dbell.h>
+#include <asm/hw_irq.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+
+#include "timing.h"
+#include "booke.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_booke.h"
+
+unsigned long kvmppc_booke_handlers;
+
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_ICOUNTER(VM, num_2M_pages),
+	STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, sum_exits),
+	STATS_DESC_COUNTER(VCPU, mmio_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, light_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, syscall_exits),
+	STATS_DESC_COUNTER(VCPU, isi_exits),
+	STATS_DESC_COUNTER(VCPU, dsi_exits),
+	STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+	STATS_DESC_COUNTER(VCPU, dec_exits),
+	STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+	STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+	STATS_DESC_COUNTER(VCPU, dbell_exits),
+	STATS_DESC_COUNTER(VCPU, gdbell_exits),
+	STATS_DESC_COUNTER(VCPU, ld),
+	STATS_DESC_COUNTER(VCPU, st),
+	STATS_DESC_COUNTER(VCPU, pthru_all),
+	STATS_DESC_COUNTER(VCPU, pthru_host),
+	STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
+/* TODO: use vcpu_printf() */
+void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	printk("pc:   %08lx msr:  %08llx\n", vcpu->arch.regs.nip,
+			vcpu->arch.shared->msr);
+	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.regs.link,
+			vcpu->arch.regs.ctr);
+	printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0,
+					    vcpu->arch.shared->srr1);
+
+	printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
+
+	for (i = 0; i < 32; i += 4) {
+		printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
+		       kvmppc_get_gpr(vcpu, i),
+		       kvmppc_get_gpr(vcpu, i+1),
+		       kvmppc_get_gpr(vcpu, i+2),
+		       kvmppc_get_gpr(vcpu, i+3));
+	}
+}
+
+#ifdef CONFIG_SPE
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_save_guest_spe(vcpu);
+	disable_kernel_spe();
+	vcpu->arch.shadow_msr &= ~MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_load_guest_spe(vcpu);
+	disable_kernel_spe();
+	vcpu->arch.shadow_msr |= MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.shared->msr & MSR_SPE) {
+		if (!(vcpu->arch.shadow_msr & MSR_SPE))
+			kvmppc_vcpu_enable_spe(vcpu);
+	} else if (vcpu->arch.shadow_msr & MSR_SPE) {
+		kvmppc_vcpu_disable_spe(vcpu);
+	}
+}
+#else
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
+/*
+ * Load up guest vcpu FP state if it's needed.
+ * It also set the MSR_FP in thread so that host know
+ * we're holding FPU, and then host can help to save
+ * guest vcpu FP state if other threads require to use FPU.
+ * This simulates an FP unavailable fault.
+ *
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (!(current->thread.regs->msr & MSR_FP)) {
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		disable_kernel_fp();
+		current->thread.fp_save_area = &vcpu->arch.fp;
+		current->thread.regs->msr |= MSR_FP;
+	}
+#endif
+}
+
+/*
+ * Save guest vcpu FP state into thread.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (current->thread.regs->msr & MSR_FP)
+		giveup_fpu(current);
+	current->thread.fp_save_area = NULL;
+#endif
+}
+
+static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
+	/* We always treat the FP bit as enabled from the host
+	   perspective, so only need to adjust the shadow MSR */
+	vcpu->arch.shadow_msr &= ~MSR_FP;
+	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
+#endif
+}
+
+/*
+ * Simulate AltiVec unavailable fault to load guest state
+ * from thread to AltiVec unit.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_load_guest_altivec(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
+		if (!(current->thread.regs->msr & MSR_VEC)) {
+			enable_kernel_altivec();
+			load_vr_state(&vcpu->arch.vr);
+			disable_kernel_altivec();
+			current->thread.vr_save_area = &vcpu->arch.vr;
+			current->thread.regs->msr |= MSR_VEC;
+		}
+	}
+#endif
+}
+
+/*
+ * Save guest vcpu AltiVec state into thread.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_save_guest_altivec(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		current->thread.vr_save_area = NULL;
+	}
+#endif
+}
+
+static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu)
+{
+	/* Synchronize guest's desire to get debug interrupts into shadow MSR */
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr &= ~MSR_DE;
+	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE;
+#endif
+
+	/* Force enable debug interrupts when user space wants to debug */
+	if (vcpu->guest_debug) {
+#ifdef CONFIG_KVM_BOOKE_HV
+		/*
+		 * Since there is no shadow MSR, sync MSR_DE into the guest
+		 * visible MSR.
+		 */
+		vcpu->arch.shared->msr |= MSR_DE;
+#else
+		vcpu->arch.shadow_msr |= MSR_DE;
+		vcpu->arch.shared->msr &= ~MSR_DE;
+#endif
+	}
+}
+
+/*
+ * Helper function for "full" MSR writes.  No need to call this if only
+ * EE/CE/ME/DE/RI are changing.
+ */
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+	u32 old_msr = vcpu->arch.shared->msr;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	new_msr |= MSR_GS;
+#endif
+
+	vcpu->arch.shared->msr = new_msr;
+
+	kvmppc_mmu_msr_notify(vcpu, old_msr);
+	kvmppc_vcpu_sync_spe(vcpu);
+	kvmppc_vcpu_sync_fpu(vcpu);
+	kvmppc_vcpu_sync_debug(vcpu);
+}
+
+static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
+                                       unsigned int priority)
+{
+	trace_kvm_booke_queue_irqprio(vcpu, priority);
+	set_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu,
+				 ulong dear_flags, ulong esr_flags)
+{
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
+}
+
+void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong srr1_flags,
+				    ulong dear_flags, ulong esr_flags)
+{
+	WARN_ON_ONCE(srr1_flags);
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+}
+
+void kvmppc_core_queue_itlb_miss(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+}
+
+void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+}
+
+static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags,
+					ulong esr_flags)
+{
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT);
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
+}
+
+void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	WARN_ON_ONCE(srr1_flags);
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+}
+
+#ifdef CONFIG_ALTIVEC
+void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu, ulong srr1_flags)
+{
+	WARN_ON_ONCE(srr1_flags);
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
+}
+#endif
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
+}
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+	return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+	unsigned int prio = BOOKE_IRQPRIO_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL;
+
+	kvmppc_booke_queue_irqprio(vcpu, prio);
+}
+
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+}
+
+static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DEBUG);
+}
+
+void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_DEBUG, &vcpu->arch.pending_exceptions);
+}
+
+static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	kvmppc_set_srr0(vcpu, srr0);
+	kvmppc_set_srr1(vcpu, srr1);
+}
+
+static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.csrr0 = srr0;
+	vcpu->arch.csrr1 = srr1;
+}
+
+static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) {
+		vcpu->arch.dsrr0 = srr0;
+		vcpu->arch.dsrr1 = srr1;
+	} else {
+		set_guest_csrr(vcpu, srr0, srr1);
+	}
+}
+
+static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.mcsrr0 = srr0;
+	vcpu->arch.mcsrr1 = srr1;
+}
+
+/* Deliver the interrupt of the corresponding priority, if possible. */
+static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
+{
+	int allowed = 0;
+	ulong msr_mask = 0;
+	bool update_esr = false, update_dear = false, update_epr = false;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+	bool keep_irq = false;
+	enum int_class int_class;
+	ulong new_msr = vcpu->arch.shared->msr;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+	if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) {
+		priority = BOOKE_IRQPRIO_EXTERNAL;
+		keep_irq = true;
+	}
+
+	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
+		update_epr = true;
+
+	switch (priority) {
+	case BOOKE_IRQPRIO_DTLB_MISS:
+	case BOOKE_IRQPRIO_DATA_STORAGE:
+	case BOOKE_IRQPRIO_ALIGNMENT:
+		update_dear = true;
+		fallthrough;
+	case BOOKE_IRQPRIO_INST_STORAGE:
+	case BOOKE_IRQPRIO_PROGRAM:
+		update_esr = true;
+		fallthrough;
+	case BOOKE_IRQPRIO_ITLB_MISS:
+	case BOOKE_IRQPRIO_SYSCALL:
+	case BOOKE_IRQPRIO_FP_UNAVAIL:
+#ifdef CONFIG_SPE_POSSIBLE
+	case BOOKE_IRQPRIO_SPE_UNAVAIL:
+	case BOOKE_IRQPRIO_SPE_FP_DATA:
+	case BOOKE_IRQPRIO_SPE_FP_ROUND:
+#endif
+#ifdef CONFIG_ALTIVEC
+	case BOOKE_IRQPRIO_ALTIVEC_UNAVAIL:
+	case BOOKE_IRQPRIO_ALTIVEC_ASSIST:
+#endif
+	case BOOKE_IRQPRIO_AP_UNAVAIL:
+		allowed = 1;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
+		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
+	case BOOKE_IRQPRIO_CRITICAL:
+	case BOOKE_IRQPRIO_DBELL_CRIT:
+		allowed = vcpu->arch.shared->msr & MSR_CE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_ME;
+		int_class = INT_CLASS_CRIT;
+		break;
+	case BOOKE_IRQPRIO_MACHINE_CHECK:
+		allowed = vcpu->arch.shared->msr & MSR_ME;
+		allowed = allowed && !crit;
+		int_class = INT_CLASS_MC;
+		break;
+	case BOOKE_IRQPRIO_DECREMENTER:
+	case BOOKE_IRQPRIO_FIT:
+		keep_irq = true;
+		fallthrough;
+	case BOOKE_IRQPRIO_EXTERNAL:
+	case BOOKE_IRQPRIO_DBELL:
+		allowed = vcpu->arch.shared->msr & MSR_EE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
+		break;
+	case BOOKE_IRQPRIO_DEBUG:
+		allowed = vcpu->arch.shared->msr & MSR_DE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_ME;
+		if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
+			int_class = INT_CLASS_DBG;
+		else
+			int_class = INT_CLASS_CRIT;
+
+		break;
+	}
+
+	if (allowed) {
+		switch (int_class) {
+		case INT_CLASS_NONCRIT:
+			set_guest_srr(vcpu, vcpu->arch.regs.nip,
+				      vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_CRIT:
+			set_guest_csrr(vcpu, vcpu->arch.regs.nip,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_DBG:
+			set_guest_dsrr(vcpu, vcpu->arch.regs.nip,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_MC:
+			set_guest_mcsrr(vcpu, vcpu->arch.regs.nip,
+					vcpu->arch.shared->msr);
+			break;
+		}
+
+		vcpu->arch.regs.nip = vcpu->arch.ivpr |
+					vcpu->arch.ivor[priority];
+		if (update_esr)
+			kvmppc_set_esr(vcpu, vcpu->arch.queued_esr);
+		if (update_dear)
+			kvmppc_set_dar(vcpu, vcpu->arch.queued_dear);
+		if (update_epr) {
+			if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+				kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+			else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
+				BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
+				kvmppc_mpic_set_epr(vcpu);
+			}
+		}
+
+		new_msr &= msr_mask;
+#if defined(CONFIG_64BIT)
+		if (vcpu->arch.epcr & SPRN_EPCR_ICM)
+			new_msr |= MSR_CM;
+#endif
+		kvmppc_set_msr(vcpu, new_msr);
+
+		if (!keep_irq)
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
+	}
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * If an interrupt is pending but masked, raise a guest doorbell
+	 * so that we are notified when the guest enables the relevant
+	 * MSR bit.
+	 */
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC);
+#endif
+
+	return allowed;
+}
+
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+	unsigned long flags;
+
+	/*
+	 * If TSR_ENW and TSR_WIS are not set then no need to exit to
+	 * userspace, so clear the KVM_REQ_WATCHDOG request.
+	 */
+	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
+		kvm_clear_request(KVM_REQ_WATCHDOG, vcpu);
+
+	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	/*
+	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+	 * then do not run the watchdog timer as this can break timer APIs.
+	 */
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
+}
+
+static void kvmppc_watchdog_func(struct timer_list *t)
+{
+	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.wdt_timer);
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS)
+				final = 1;
+			else
+				new_tsr = tsr | TSR_WIS;
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & TSR_WIS) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * If this is final watchdog expiry and some action is required
+	 * then exit to userspace.
+	 */
+	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+	    vcpu->arch.watchdog_enabled) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
+
+static void update_timer_ints(struct kvm_vcpu *vcpu)
+{
+	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
+		kvmppc_core_queue_dec(vcpu);
+	else
+		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
+}
+
+static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
+{
+	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned int priority;
+
+	priority = __ffs(*pending);
+	while (priority < BOOKE_IRQPRIO_MAX) {
+		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
+			break;
+
+		priority = find_next_bit(pending,
+		                         BITS_PER_BYTE * sizeof(*pending),
+		                         priority + 1);
+	}
+
+	/* Tell the guest about our interrupt status */
+	vcpu->arch.shared->int_pending = !!*pending;
+}
+
+/* Check pending exceptions and deliver one, if possible. */
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	WARN_ON_ONCE(!irqs_disabled());
+
+	kvmppc_core_check_exceptions(vcpu);
+
+	if (kvm_request_pending(vcpu)) {
+		/* Exception delivery raised request; start over */
+		return 1;
+	}
+
+	if (vcpu->arch.shared->msr & MSR_WE) {
+		local_irq_enable();
+		kvm_vcpu_halt(vcpu);
+		hard_irq_disable();
+
+		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+		r = 1;
+	}
+
+	return r;
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+	int r = 1; /* Indicate we want to get back into the guest */
+
+	if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
+		update_timer_ints(vcpu);
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_core_flush_tlb(vcpu);
+#endif
+
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
+		vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
+		r = 0;
+	}
+
+	if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) {
+		vcpu->run->epr.epr = 0;
+		vcpu->arch.epr_needed = true;
+		vcpu->run->exit_reason = KVM_EXIT_EPR;
+		r = 0;
+	}
+
+	return r;
+}
+
+int kvmppc_vcpu_run(struct kvm_vcpu *vcpu)
+{
+	int ret, s;
+	struct debug_reg debug;
+
+	if (!vcpu->arch.sane) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
+	s = kvmppc_prepare_to_enter(vcpu);
+	if (s <= 0) {
+		ret = s;
+		goto out;
+	}
+	/* interrupts now hard-disabled */
+
+#ifdef CONFIG_PPC_FPU
+	/* Save userspace FPU state in stack */
+	enable_kernel_fp();
+
+	/*
+	 * Since we can't trap on MSR_FP in GS-mode, we consider the guest
+	 * as always using the FPU.
+	 */
+	kvmppc_load_guest_fp(vcpu);
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	/* Save userspace AltiVec state in stack */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		enable_kernel_altivec();
+	/*
+	 * Since we can't trap on MSR_VEC in GS-mode, we consider the guest
+	 * as always using the AltiVec.
+	 */
+	kvmppc_load_guest_altivec(vcpu);
+#endif
+
+	/* Switch to guest debug context */
+	debug = vcpu->arch.dbg_reg;
+	switch_booke_debug_regs(&debug);
+	debug = current->thread.debug;
+	current->thread.debug = vcpu->arch.dbg_reg;
+
+	vcpu->arch.pgdir = vcpu->kvm->mm->pgd;
+	kvmppc_fix_ee_before_entry();
+
+	ret = __kvmppc_vcpu_run(vcpu);
+
+	/* No need for guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
+
+	/* Switch back to user space debug context */
+	switch_booke_debug_regs(&debug);
+	current->thread.debug = debug;
+
+#ifdef CONFIG_PPC_FPU
+	kvmppc_save_guest_fp(vcpu);
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	kvmppc_save_guest_altivec(vcpu);
+#endif
+
+out:
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	return ret;
+}
+
+static int emulation_exit(struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+
+	er = kvmppc_emulate_instruction(vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* don't overwrite subtypes, just account kvm_stats */
+		kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
+		/* Future optimization: only reload non-volatiles if
+		 * they were actually modified by emulation. */
+		return RESUME_GUEST_NV;
+
+	case EMULATE_AGAIN:
+		return RESUME_GUEST;
+
+	case EMULATE_FAIL:
+		printk(KERN_CRIT "%s: emulation at %lx failed (%08lx)\n",
+		       __func__, vcpu->arch.regs.nip, vcpu->arch.last_inst);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		vcpu->run->hw.hardware_exit_reason = ~0ULL << 32;
+		vcpu->run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+		kvmppc_core_queue_program(vcpu, ESR_PIL);
+		return RESUME_HOST;
+
+	case EMULATE_EXIT_USER:
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
+static int kvmppc_handle_debug(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	struct debug_reg *dbg_reg = &(vcpu->arch.dbg_reg);
+	u32 dbsr = vcpu->arch.dbsr;
+
+	if (vcpu->guest_debug == 0) {
+		/*
+		 * Debug resources belong to Guest.
+		 * Imprecise debug event is not injected
+		 */
+		if (dbsr & DBSR_IDE) {
+			dbsr &= ~DBSR_IDE;
+			if (!dbsr)
+				return RESUME_GUEST;
+		}
+
+		if (dbsr && (vcpu->arch.shared->msr & MSR_DE) &&
+			    (vcpu->arch.dbg_reg.dbcr0 & DBCR0_IDM))
+			kvmppc_core_queue_debug(vcpu);
+
+		/* Inject a program interrupt if trap debug is not allowed */
+		if ((dbsr & DBSR_TIE) && !(vcpu->arch.shared->msr & MSR_DE))
+			kvmppc_core_queue_program(vcpu, ESR_PTR);
+
+		return RESUME_GUEST;
+	}
+
+	/*
+	 * Debug resource owned by userspace.
+	 * Clear guest dbsr (vcpu->arch.dbsr)
+	 */
+	vcpu->arch.dbsr = 0;
+	run->debug.arch.status = 0;
+	run->debug.arch.address = vcpu->arch.regs.nip;
+
+	if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) {
+		run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT;
+	} else {
+		if (dbsr & (DBSR_DAC1W | DBSR_DAC2W))
+			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE;
+		else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R))
+			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ;
+		if (dbsr & (DBSR_DAC1R | DBSR_DAC1W))
+			run->debug.arch.address = dbg_reg->dac1;
+		else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W))
+			run->debug.arch.address = dbg_reg->dac2;
+	}
+
+	return RESUME_HOST;
+}
+
+static void kvmppc_fill_pt_regs(struct pt_regs *regs)
+{
+	ulong r1, msr, lr;
+
+	asm("mr %0, 1" : "=r"(r1));
+	asm("mflr %0" : "=r"(lr));
+	asm("mfmsr %0" : "=r"(msr));
+
+	memset(regs, 0, sizeof(*regs));
+	regs->gpr[1] = r1;
+	regs->nip = _THIS_IP_;
+	regs->msr = msr;
+	regs->link = lr;
+}
+
+/*
+ * For interrupts needed to be handled by host interrupt handlers,
+ * corresponding host handler are called from here in similar way
+ * (but not exact) as they are called from low level handler
+ * (such as from arch/powerpc/kernel/head_fsl_booke.S).
+ */
+static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
+				     unsigned int exit_nr)
+{
+	struct pt_regs regs;
+
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_EXTERNAL:
+		kvmppc_fill_pt_regs(&regs);
+		do_IRQ(&regs);
+		break;
+	case BOOKE_INTERRUPT_DECREMENTER:
+		kvmppc_fill_pt_regs(&regs);
+		timer_interrupt(&regs);
+		break;
+#if defined(CONFIG_PPC_DOORBELL)
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_fill_pt_regs(&regs);
+		doorbell_exception(&regs);
+		break;
+#endif
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		/* FIXME */
+		break;
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		kvmppc_fill_pt_regs(&regs);
+		performance_monitor_exception(&regs);
+		break;
+	case BOOKE_INTERRUPT_WATCHDOG:
+		kvmppc_fill_pt_regs(&regs);
+#ifdef CONFIG_BOOKE_WDT
+		WatchdogException(&regs);
+#else
+		unknown_exception(&regs);
+#endif
+		break;
+	case BOOKE_INTERRUPT_CRITICAL:
+		kvmppc_fill_pt_regs(&regs);
+		unknown_exception(&regs);
+		break;
+	case BOOKE_INTERRUPT_DEBUG:
+		/* Save DBSR before preemption is enabled */
+		vcpu->arch.dbsr = mfspr(SPRN_DBSR);
+		kvmppc_clear_dbsr();
+		break;
+	}
+}
+
+static int kvmppc_resume_inst_load(struct kvm_vcpu *vcpu,
+				  enum emulation_result emulated, u32 last_inst)
+{
+	switch (emulated) {
+	case EMULATE_AGAIN:
+		return RESUME_GUEST;
+
+	case EMULATE_FAIL:
+		pr_debug("%s: load instruction from guest address %lx failed\n",
+		       __func__, vcpu->arch.regs.nip);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		vcpu->run->hw.hardware_exit_reason = ~0ULL << 32;
+		vcpu->run->hw.hardware_exit_reason |= last_inst;
+		kvmppc_core_queue_program(vcpu, ESR_PIL);
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * kvmppc_handle_exit
+ *
+ * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
+ */
+int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+	struct kvm_run *run = vcpu->run;
+	int r = RESUME_HOST;
+	int s;
+	int idx;
+	u32 last_inst = KVM_INST_FETCH_FAILED;
+	ppc_inst_t pinst;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	/* Fix irq state (pairs with kvmppc_fix_ee_before_entry()) */
+	kvmppc_fix_ee_after_exit();
+
+	/* update before a new last_exit_type is rewritten */
+	kvmppc_update_timing_stats(vcpu);
+
+	/* restart interrupts if they were meant for the host */
+	kvmppc_restart_interrupt(vcpu, exit_nr);
+
+	/*
+	 * get last instruction before being preempted
+	 * TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR & ESR_DATA
+	 */
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_DATA_STORAGE:
+	case BOOKE_INTERRUPT_DTLB_MISS:
+	case BOOKE_INTERRUPT_HV_PRIV:
+		emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
+		last_inst = ppc_inst_val(pinst);
+		break;
+	case BOOKE_INTERRUPT_PROGRAM:
+		/* SW breakpoints arrive as illegal instructions on HV */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
+			emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
+			last_inst = ppc_inst_val(pinst);
+		}
+		break;
+	default:
+		break;
+	}
+
+	trace_kvm_exit(exit_nr, vcpu);
+
+	context_tracking_guest_exit();
+	if (!vtime_accounting_enabled_this_cpu()) {
+		local_irq_enable();
+		/*
+		 * Service IRQs here before vtime_account_guest_exit() so any
+		 * ticks that occurred while running the guest are accounted to
+		 * the guest. If vtime accounting is enabled, accounting uses
+		 * TB rather than ticks, so it can be done without enabling
+		 * interrupts here, which has the problem that it accounts
+		 * interrupt processing overhead to the host.
+		 */
+		local_irq_disable();
+	}
+	vtime_account_guest_exit();
+
+	local_irq_enable();
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	if (emulated != EMULATE_DONE) {
+		r = kvmppc_resume_inst_load(vcpu, emulated, last_inst);
+		goto out;
+	}
+
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
+		kvmppc_dump_vcpu(vcpu);
+		/* For debugging, send invalid exit reason to user space */
+		run->hw.hardware_exit_reason = ~1ULL << 32;
+		run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR);
+		r = RESUME_HOST;
+		break;
+
+	case BOOKE_INTERRUPT_EXTERNAL:
+		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_DECREMENTER:
+		kvmppc_account_exit(vcpu, DEC_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_WATCHDOG:
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_account_exit(vcpu, DBELL_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL_CRIT:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_CE or MSR_ME was not
+		 * set.  Once we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_EE was not set.  Once
+		 * we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_HV_PRIV:
+		r = emulation_exit(vcpu);
+		break;
+
+	case BOOKE_INTERRUPT_PROGRAM:
+		if ((vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) &&
+			(last_inst == KVMPPC_INST_SW_BREAKPOINT)) {
+			/*
+			 * We are here because of an SW breakpoint instr,
+			 * so lets return to host to handle.
+			 */
+			r = kvmppc_handle_debug(vcpu);
+			run->exit_reason = KVM_EXIT_DEBUG;
+			kvmppc_account_exit(vcpu, DEBUG_EXITS);
+			break;
+		}
+
+		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) {
+			/*
+			 * Program traps generated by user-level software must
+			 * be handled by the guest kernel.
+			 *
+			 * In GS mode, hypervisor privileged instructions trap
+			 * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are
+			 * actual program interrupts, handled by the guest.
+			 */
+			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
+			r = RESUME_GUEST;
+			kvmppc_account_exit(vcpu, USR_PR_INST);
+			break;
+		}
+
+		r = emulation_exit(vcpu);
+		break;
+
+	case BOOKE_INTERRUPT_FP_UNAVAIL:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+		kvmppc_account_exit(vcpu, FP_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+
+#ifdef CONFIG_SPE
+	case BOOKE_INTERRUPT_SPE_UNAVAIL: {
+		if (vcpu->arch.shared->msr & MSR_SPE)
+			kvmppc_vcpu_enable_spe(vcpu);
+		else
+			kvmppc_booke_queue_irqprio(vcpu,
+						   BOOKE_IRQPRIO_SPE_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+	}
+
+	case BOOKE_INTERRUPT_SPE_FP_DATA:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_SPE_FP_ROUND:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
+		r = RESUME_GUEST;
+		break;
+#elif defined(CONFIG_SPE_POSSIBLE)
+	case BOOKE_INTERRUPT_SPE_UNAVAIL:
+		/*
+		 * Guest wants SPE, but host kernel doesn't support it.  Send
+		 * an "unimplemented operation" program check to the guest.
+		 */
+		kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
+		r = RESUME_GUEST;
+		break;
+
+	/*
+	 * These really should never happen without CONFIG_SPE,
+	 * as we should never enable the real MSR[SPE] in the guest.
+	 */
+	case BOOKE_INTERRUPT_SPE_FP_DATA:
+	case BOOKE_INTERRUPT_SPE_FP_ROUND:
+		printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
+		       __func__, exit_nr, vcpu->arch.regs.nip);
+		run->hw.hardware_exit_reason = exit_nr;
+		r = RESUME_HOST;
+		break;
+#endif /* CONFIG_SPE_POSSIBLE */
+
+/*
+ * On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC,
+ * see kvmppc_e500mc_check_processor_compat().
+ */
+#ifdef CONFIG_ALTIVEC
+	case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_ALTIVEC_ASSIST:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_ASSIST);
+		r = RESUME_GUEST;
+		break;
+#endif
+
+	case BOOKE_INTERRUPT_DATA_STORAGE:
+		kvmppc_core_queue_data_storage(vcpu, 0, vcpu->arch.fault_dear,
+		                               vcpu->arch.fault_esr);
+		kvmppc_account_exit(vcpu, DSI_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_INST_STORAGE:
+		kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr);
+		kvmppc_account_exit(vcpu, ISI_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_ALIGNMENT:
+		kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear,
+		                            vcpu->arch.fault_esr);
+		r = RESUME_GUEST;
+		break;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	case BOOKE_INTERRUPT_HV_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR)) {
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+		} else {
+			/*
+			 * hcall from guest userspace -- send privileged
+			 * instruction program check.
+			 */
+			kvmppc_core_queue_program(vcpu, ESR_PPR);
+		}
+
+		r = RESUME_GUEST;
+		break;
+#else
+	case BOOKE_INTERRUPT_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		}
+		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
+		r = RESUME_GUEST;
+		break;
+#endif
+
+	case BOOKE_INTERRUPT_DTLB_MISS: {
+		unsigned long eaddr = vcpu->arch.fault_dear;
+		int gtlb_index;
+		gpa_t gpaddr;
+		gfn_t gfn;
+
+#ifdef CONFIG_KVM_E500V2
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+			kvmppc_map_magic(vcpu);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			r = RESUME_GUEST;
+
+			break;
+		}
+#endif
+
+		/* Check the guest TLB. */
+		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
+			/* The guest didn't have a mapping for it. */
+			kvmppc_core_queue_dtlb_miss(vcpu,
+			                            vcpu->arch.fault_dear,
+			                            vcpu->arch.fault_esr);
+			kvmppc_mmu_dtlb_miss(vcpu);
+			kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
+			r = RESUME_GUEST;
+			break;
+		}
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+		gfn = gpaddr >> PAGE_SHIFT;
+
+		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+			/* The guest TLB had a mapping, but the shadow TLB
+			 * didn't, and it is RAM. This could be because:
+			 * a) the entry is mapping the host kernel, or
+			 * b) the guest used a large mapping which we're faking
+			 * Either way, we need to satisfy the fault without
+			 * invoking the guest. */
+			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			r = RESUME_GUEST;
+		} else {
+			/* Guest has mapped and accessed a page which is not
+			 * actually RAM. */
+			vcpu->arch.paddr_accessed = gpaddr;
+			vcpu->arch.vaddr_accessed = eaddr;
+			r = kvmppc_emulate_mmio(vcpu);
+			kvmppc_account_exit(vcpu, MMIO_EXITS);
+		}
+
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+
+	case BOOKE_INTERRUPT_ITLB_MISS: {
+		unsigned long eaddr = vcpu->arch.regs.nip;
+		gpa_t gpaddr;
+		gfn_t gfn;
+		int gtlb_index;
+
+		r = RESUME_GUEST;
+
+		/* Check the guest TLB. */
+		gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
+			/* The guest didn't have a mapping for it. */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+			kvmppc_mmu_itlb_miss(vcpu);
+			kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
+			break;
+		}
+
+		kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+		gfn = gpaddr >> PAGE_SHIFT;
+
+		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+			/* The guest TLB had a mapping, but the shadow TLB
+			 * didn't. This could be because:
+			 * a) the entry is mapping the host kernel, or
+			 * b) the guest used a large mapping which we're faking
+			 * Either way, we need to satisfy the fault without
+			 * invoking the guest. */
+			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
+		} else {
+			/* Guest mapped and leaped at non-RAM! */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
+		}
+
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+
+	case BOOKE_INTERRUPT_DEBUG: {
+		r = kvmppc_handle_debug(vcpu);
+		if (r == RESUME_HOST)
+			run->exit_reason = KVM_EXIT_DEBUG;
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
+		break;
+	}
+
+	default:
+		printk(KERN_EMERG "exit_nr %d\n", exit_nr);
+		BUG();
+	}
+
+out:
+	/*
+	 * To avoid clobbering exit_reason, only check for signals if we
+	 * aren't already exiting to userspace for some other reason.
+	 */
+	if (!(r & RESUME_HOST)) {
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0)
+			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+		else {
+			/* interrupts now hard-disabled */
+			kvmppc_fix_ee_before_entry();
+			kvmppc_load_guest_fp(vcpu);
+			kvmppc_load_guest_altivec(vcpu);
+		}
+	}
+
+	return r;
+}
+
+static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr)
+{
+	u32 old_tsr = vcpu->arch.tsr;
+
+	vcpu->arch.tsr = new_tsr;
+
+	if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
+	update_timer_ints(vcpu);
+}
+
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	/* setup watchdog timer once */
+	spin_lock_init(&vcpu->arch.wdt_lock);
+	timer_setup(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, 0);
+
+	/*
+	 * Clear DBSR.MRR to avoid guest debug interrupt as
+	 * this is of host interest
+	 */
+	mtspr(SPRN_DBSR, DBSR_MRR);
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	del_timer_sync(&vcpu->arch.wdt_timer);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	vcpu_load(vcpu);
+
+	regs->pc = vcpu->arch.regs.nip;
+	regs->cr = kvmppc_get_cr(vcpu);
+	regs->ctr = vcpu->arch.regs.ctr;
+	regs->lr = vcpu->arch.regs.link;
+	regs->xer = kvmppc_get_xer(vcpu);
+	regs->msr = vcpu->arch.shared->msr;
+	regs->srr0 = kvmppc_get_srr0(vcpu);
+	regs->srr1 = kvmppc_get_srr1(vcpu);
+	regs->pid = vcpu->arch.pid;
+	regs->sprg0 = kvmppc_get_sprg0(vcpu);
+	regs->sprg1 = kvmppc_get_sprg1(vcpu);
+	regs->sprg2 = kvmppc_get_sprg2(vcpu);
+	regs->sprg3 = kvmppc_get_sprg3(vcpu);
+	regs->sprg4 = kvmppc_get_sprg4(vcpu);
+	regs->sprg5 = kvmppc_get_sprg5(vcpu);
+	regs->sprg6 = kvmppc_get_sprg6(vcpu);
+	regs->sprg7 = kvmppc_get_sprg7(vcpu);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+
+	vcpu_put(vcpu);
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	vcpu_load(vcpu);
+
+	vcpu->arch.regs.nip = regs->pc;
+	kvmppc_set_cr(vcpu, regs->cr);
+	vcpu->arch.regs.ctr = regs->ctr;
+	vcpu->arch.regs.link = regs->lr;
+	kvmppc_set_xer(vcpu, regs->xer);
+	kvmppc_set_msr(vcpu, regs->msr);
+	kvmppc_set_srr0(vcpu, regs->srr0);
+	kvmppc_set_srr1(vcpu, regs->srr1);
+	kvmppc_set_pid(vcpu, regs->pid);
+	kvmppc_set_sprg0(vcpu, regs->sprg0);
+	kvmppc_set_sprg1(vcpu, regs->sprg1);
+	kvmppc_set_sprg2(vcpu, regs->sprg2);
+	kvmppc_set_sprg3(vcpu, regs->sprg3);
+	kvmppc_set_sprg4(vcpu, regs->sprg4);
+	kvmppc_set_sprg5(vcpu, regs->sprg5);
+	kvmppc_set_sprg6(vcpu, regs->sprg6);
+	kvmppc_set_sprg7(vcpu, regs->sprg7);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+
+	vcpu_put(vcpu);
+	return 0;
+}
+
+static void get_sregs_base(struct kvm_vcpu *vcpu,
+                           struct kvm_sregs *sregs)
+{
+	u64 tb = get_tb();
+
+	sregs->u.e.features |= KVM_SREGS_E_BASE;
+
+	sregs->u.e.csrr0 = vcpu->arch.csrr0;
+	sregs->u.e.csrr1 = vcpu->arch.csrr1;
+	sregs->u.e.mcsr = vcpu->arch.mcsr;
+	sregs->u.e.esr = kvmppc_get_esr(vcpu);
+	sregs->u.e.dear = kvmppc_get_dar(vcpu);
+	sregs->u.e.tsr = vcpu->arch.tsr;
+	sregs->u.e.tcr = vcpu->arch.tcr;
+	sregs->u.e.dec = kvmppc_get_dec(vcpu, tb);
+	sregs->u.e.tb = tb;
+	sregs->u.e.vrsave = vcpu->arch.vrsave;
+}
+
+static int set_sregs_base(struct kvm_vcpu *vcpu,
+                          struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_BASE))
+		return 0;
+
+	vcpu->arch.csrr0 = sregs->u.e.csrr0;
+	vcpu->arch.csrr1 = sregs->u.e.csrr1;
+	vcpu->arch.mcsr = sregs->u.e.mcsr;
+	kvmppc_set_esr(vcpu, sregs->u.e.esr);
+	kvmppc_set_dar(vcpu, sregs->u.e.dear);
+	vcpu->arch.vrsave = sregs->u.e.vrsave;
+	kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
+
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) {
+		vcpu->arch.dec = sregs->u.e.dec;
+		kvmppc_emulate_dec(vcpu);
+	}
+
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR)
+		kvmppc_set_tsr(vcpu, sregs->u.e.tsr);
+
+	return 0;
+}
+
+static void get_sregs_arch206(struct kvm_vcpu *vcpu,
+                              struct kvm_sregs *sregs)
+{
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206;
+
+	sregs->u.e.pir = vcpu->vcpu_id;
+	sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0;
+	sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1;
+	sregs->u.e.decar = vcpu->arch.decar;
+	sregs->u.e.ivpr = vcpu->arch.ivpr;
+}
+
+static int set_sregs_arch206(struct kvm_vcpu *vcpu,
+                             struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206))
+		return 0;
+
+	if (sregs->u.e.pir != vcpu->vcpu_id)
+		return -EINVAL;
+
+	vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0;
+	vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1;
+	vcpu->arch.decar = sregs->u.e.decar;
+	vcpu->arch.ivpr = sregs->u.e.ivpr;
+
+	return 0;
+}
+
+int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.features |= KVM_SREGS_E_IVOR;
+
+	sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+	sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+	sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+	sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+	sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+	sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+	sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+	sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+	sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+	sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+	sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+	sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+	sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+	sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+	sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+	sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+	return 0;
+}
+
+int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = sregs->u.e.ivor_low[4];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15];
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int ret;
+
+	vcpu_load(vcpu);
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	get_sregs_base(vcpu, sregs);
+	get_sregs_arch206(vcpu, sregs);
+	ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+
+	vcpu_put(vcpu);
+	return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int ret = -EINVAL;
+
+	vcpu_load(vcpu);
+	if (vcpu->arch.pvr != sregs->pvr)
+		goto out;
+
+	ret = set_sregs_base(vcpu, sregs);
+	if (ret < 0)
+		goto out;
+
+	ret = set_sregs_arch206(vcpu, sregs);
+	if (ret < 0)
+		goto out;
+
+	ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+
+out:
+	vcpu_put(vcpu);
+	return ret;
+}
+
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_IAC1:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac1);
+		break;
+	case KVM_REG_PPC_IAC2:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac2);
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case KVM_REG_PPC_IAC3:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac3);
+		break;
+	case KVM_REG_PPC_IAC4:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac4);
+		break;
+#endif
+	case KVM_REG_PPC_DAC1:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.dac1);
+		break;
+	case KVM_REG_PPC_DAC2:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.dac2);
+		break;
+	case KVM_REG_PPC_EPR: {
+		u32 epr = kvmppc_get_epr(vcpu);
+		*val = get_reg_val(id, epr);
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR:
+		*val = get_reg_val(id, vcpu->arch.epcr);
+		break;
+#endif
+	case KVM_REG_PPC_TCR:
+		*val = get_reg_val(id, vcpu->arch.tcr);
+		break;
+	case KVM_REG_PPC_TSR:
+		*val = get_reg_val(id, vcpu->arch.tsr);
+		break;
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_VRSAVE:
+		*val = get_reg_val(id, vcpu->arch.vrsave);
+		break;
+	default:
+		r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val);
+		break;
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_IAC1:
+		vcpu->arch.dbg_reg.iac1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAC2:
+		vcpu->arch.dbg_reg.iac2 = set_reg_val(id, *val);
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case KVM_REG_PPC_IAC3:
+		vcpu->arch.dbg_reg.iac3 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAC4:
+		vcpu->arch.dbg_reg.iac4 = set_reg_val(id, *val);
+		break;
+#endif
+	case KVM_REG_PPC_DAC1:
+		vcpu->arch.dbg_reg.dac1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAC2:
+		vcpu->arch.dbg_reg.dac2 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_EPR: {
+		u32 new_epr = set_reg_val(id, *val);
+		kvmppc_set_epr(vcpu, new_epr);
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR: {
+		u32 new_epcr = set_reg_val(id, *val);
+		kvmppc_set_epcr(vcpu, new_epcr);
+		break;
+	}
+#endif
+	case KVM_REG_PPC_OR_TSR: {
+		u32 tsr_bits = set_reg_val(id, *val);
+		kvmppc_set_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_CLEAR_TSR: {
+		u32 tsr_bits = set_reg_val(id, *val);
+		kvmppc_clr_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_TSR: {
+		u32 tsr = set_reg_val(id, *val);
+		kvmppc_set_tsr(vcpu, tsr);
+		break;
+	}
+	case KVM_REG_PPC_TCR: {
+		u32 tcr = set_reg_val(id, *val);
+		kvmppc_set_tcr(vcpu, tcr);
+		break;
+	}
+	case KVM_REG_PPC_VRSAVE:
+		vcpu->arch.vrsave = set_reg_val(id, *val);
+		break;
+	default:
+		r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val);
+		break;
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -EOPNOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -EOPNOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+	int r;
+
+	vcpu_load(vcpu);
+	r = kvmppc_core_vcpu_translate(vcpu, tr);
+	vcpu_put(vcpu);
+	return r;
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	return -EOPNOTSUPP;
+}
+
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      const struct kvm_memory_slot *old,
+				      struct kvm_memory_slot *new,
+				      enum kvm_mr_change change)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
+				enum kvm_mr_change change)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
+{
+#if defined(CONFIG_64BIT)
+	vcpu->arch.epcr = new_epcr;
+#ifdef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
+	if (vcpu->arch.epcr  & SPRN_EPCR_ICM)
+		vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
+#endif
+#endif
+}
+
+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
+{
+	vcpu->arch.tcr = new_tcr;
+	arm_next_watchdog(vcpu);
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	set_bits(tsr_bits, &vcpu->arch.tsr);
+	smp_wmb();
+	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (tsr_bits & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_decrementer_func(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.tcr & TCR_ARE) {
+		vcpu->arch.dec = vcpu->arch.decar;
+		kvmppc_emulate_dec(vcpu);
+	}
+
+	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
+}
+
+static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg,
+				       uint64_t addr, int index)
+{
+	switch (index) {
+	case 0:
+		dbg_reg->dbcr0 |= DBCR0_IAC1;
+		dbg_reg->iac1 = addr;
+		break;
+	case 1:
+		dbg_reg->dbcr0 |= DBCR0_IAC2;
+		dbg_reg->iac2 = addr;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case 2:
+		dbg_reg->dbcr0 |= DBCR0_IAC3;
+		dbg_reg->iac3 = addr;
+		break;
+	case 3:
+		dbg_reg->dbcr0 |= DBCR0_IAC4;
+		dbg_reg->iac4 = addr;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	dbg_reg->dbcr0 |= DBCR0_IDM;
+	return 0;
+}
+
+static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr,
+				       int type, int index)
+{
+	switch (index) {
+	case 0:
+		if (type & KVMPPC_DEBUG_WATCH_READ)
+			dbg_reg->dbcr0 |= DBCR0_DAC1R;
+		if (type & KVMPPC_DEBUG_WATCH_WRITE)
+			dbg_reg->dbcr0 |= DBCR0_DAC1W;
+		dbg_reg->dac1 = addr;
+		break;
+	case 1:
+		if (type & KVMPPC_DEBUG_WATCH_READ)
+			dbg_reg->dbcr0 |= DBCR0_DAC2R;
+		if (type & KVMPPC_DEBUG_WATCH_WRITE)
+			dbg_reg->dbcr0 |= DBCR0_DAC2W;
+		dbg_reg->dac2 = addr;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dbg_reg->dbcr0 |= DBCR0_IDM;
+	return 0;
+}
+static void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap,
+				  bool set)
+{
+	/* XXX: Add similar MSR protection for BookE-PR */
+#ifdef CONFIG_KVM_BOOKE_HV
+	BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP));
+	if (set) {
+		if (prot_bitmap & MSR_UCLE)
+			vcpu->arch.shadow_msrp |= MSRP_UCLEP;
+		if (prot_bitmap & MSR_DE)
+			vcpu->arch.shadow_msrp |= MSRP_DEP;
+		if (prot_bitmap & MSR_PMM)
+			vcpu->arch.shadow_msrp |= MSRP_PMMP;
+	} else {
+		if (prot_bitmap & MSR_UCLE)
+			vcpu->arch.shadow_msrp &= ~MSRP_UCLEP;
+		if (prot_bitmap & MSR_DE)
+			vcpu->arch.shadow_msrp &= ~MSRP_DEP;
+		if (prot_bitmap & MSR_PMM)
+			vcpu->arch.shadow_msrp &= ~MSRP_PMMP;
+	}
+#endif
+}
+
+int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid,
+		 enum xlate_readwrite xlrw, struct kvmppc_pte *pte)
+{
+	int gtlb_index;
+	gpa_t gpaddr;
+
+#ifdef CONFIG_KVM_E500V2
+	if (!(vcpu->arch.shared->msr & MSR_PR) &&
+	    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+		pte->eaddr = eaddr;
+		pte->raddr = (vcpu->arch.magic_page_pa & PAGE_MASK) |
+			     (eaddr & ~PAGE_MASK);
+		pte->vpage = eaddr >> PAGE_SHIFT;
+		pte->may_read = true;
+		pte->may_write = true;
+		pte->may_execute = true;
+
+		return 0;
+	}
+#endif
+
+	/* Check the guest TLB. */
+	switch (xlid) {
+	case XLATE_INST:
+		gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
+		break;
+	case XLATE_DATA:
+		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
+		break;
+	default:
+		BUG();
+	}
+
+	/* Do we have a TLB entry at all? */
+	if (gtlb_index < 0)
+		return -ENOENT;
+
+	gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+
+	pte->eaddr = eaddr;
+	pte->raddr = (gpaddr & PAGE_MASK) | (eaddr & ~PAGE_MASK);
+	pte->vpage = eaddr >> PAGE_SHIFT;
+
+	/* XXX read permissions from the guest TLB */
+	pte->may_read = true;
+	pte->may_write = true;
+	pte->may_execute = true;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					 struct kvm_guest_debug *dbg)
+{
+	struct debug_reg *dbg_reg;
+	int n, b = 0, w = 0;
+	int ret = 0;
+
+	vcpu_load(vcpu);
+
+	if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
+		vcpu->arch.dbg_reg.dbcr0 = 0;
+		vcpu->guest_debug = 0;
+		kvm_guest_protect_msr(vcpu, MSR_DE, false);
+		goto out;
+	}
+
+	kvm_guest_protect_msr(vcpu, MSR_DE, true);
+	vcpu->guest_debug = dbg->control;
+	vcpu->arch.dbg_reg.dbcr0 = 0;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vcpu->arch.dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+
+	/* Code below handles only HW breakpoints */
+	dbg_reg = &(vcpu->arch.dbg_reg);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1
+	 * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0
+	 */
+	dbg_reg->dbcr1 = 0;
+	dbg_reg->dbcr2 = 0;
+#else
+	/*
+	 * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1
+	 * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR
+	 * is set.
+	 */
+	dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US |
+			  DBCR1_IAC4US;
+	dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
+#endif
+
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+		goto out;
+
+	ret = -EINVAL;
+	for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) {
+		uint64_t addr = dbg->arch.bp[n].addr;
+		uint32_t type = dbg->arch.bp[n].type;
+
+		if (type == KVMPPC_DEBUG_NONE)
+			continue;
+
+		if (type & ~(KVMPPC_DEBUG_WATCH_READ |
+			     KVMPPC_DEBUG_WATCH_WRITE |
+			     KVMPPC_DEBUG_BREAKPOINT))
+			goto out;
+
+		if (type & KVMPPC_DEBUG_BREAKPOINT) {
+			/* Setting H/W breakpoint */
+			if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++))
+				goto out;
+		} else {
+			/* Setting H/W watchpoint */
+			if (kvmppc_booke_add_watchpoint(dbg_reg, addr,
+							type, w++))
+				goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	vcpu_put(vcpu);
+	return ret;
+}
+
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->cpu = smp_processor_id();
+	current->thread.kvm_vcpu = vcpu;
+}
+
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	current->thread.kvm_vcpu = NULL;
+	vcpu->cpu = -1;
+
+	/* Clear pending debug event in DBSR */
+	kvmppc_clear_dbsr();
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu)
+{
+	int i;
+	int r;
+
+	r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu);
+	if (r)
+		return r;
+
+	/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
+	vcpu->arch.regs.nip = 0;
+	vcpu->arch.shared->pir = vcpu->vcpu_id;
+	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
+	kvmppc_set_msr(vcpu, 0);
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS;
+	vcpu->arch.shadow_pid = 1;
+	vcpu->arch.shared->msr = 0;
+#endif
+
+	/* Eye-catching numbers so we know if the guest takes an interrupt
+	 * before it's programmed its own IVPR/IVORs. */
+	vcpu->arch.ivpr = 0x55550000;
+	for (i = 0; i < BOOKE_IRQPRIO_MAX; i++)
+		vcpu->arch.ivor[i] = 0x7700 | i * 4;
+
+	kvmppc_init_timing_stats(vcpu);
+
+	r = kvmppc_core_vcpu_setup(vcpu);
+	if (r)
+		vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+	kvmppc_sanity_check(vcpu);
+	return r;
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvm->arch.kvm_ops->destroy_vm(kvm);
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
+}
+
+int __init kvmppc_booke_init(void)
+{
+#ifndef CONFIG_KVM_BOOKE_HV
+	unsigned long ivor[16];
+	unsigned long *handler = kvmppc_booke_handler_addr;
+	unsigned long max_ivor = 0;
+	unsigned long handler_len;
+	int i;
+
+	/* We install our own exception handlers by hijacking IVPR. IVPR must
+	 * be 16-bit aligned, so we need a 64KB allocation. */
+	kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+	                                         VCPU_SIZE_ORDER);
+	if (!kvmppc_booke_handlers)
+		return -ENOMEM;
+
+	/* XXX make sure our handlers are smaller than Linux's */
+
+	/* Copy our interrupt handlers to match host IVORs. That way we don't
+	 * have to swap the IVORs on every guest/host transition. */
+	ivor[0] = mfspr(SPRN_IVOR0);
+	ivor[1] = mfspr(SPRN_IVOR1);
+	ivor[2] = mfspr(SPRN_IVOR2);
+	ivor[3] = mfspr(SPRN_IVOR3);
+	ivor[4] = mfspr(SPRN_IVOR4);
+	ivor[5] = mfspr(SPRN_IVOR5);
+	ivor[6] = mfspr(SPRN_IVOR6);
+	ivor[7] = mfspr(SPRN_IVOR7);
+	ivor[8] = mfspr(SPRN_IVOR8);
+	ivor[9] = mfspr(SPRN_IVOR9);
+	ivor[10] = mfspr(SPRN_IVOR10);
+	ivor[11] = mfspr(SPRN_IVOR11);
+	ivor[12] = mfspr(SPRN_IVOR12);
+	ivor[13] = mfspr(SPRN_IVOR13);
+	ivor[14] = mfspr(SPRN_IVOR14);
+	ivor[15] = mfspr(SPRN_IVOR15);
+
+	for (i = 0; i < 16; i++) {
+		if (ivor[i] > max_ivor)
+			max_ivor = i;
+
+		handler_len = handler[i + 1] - handler[i];
+		memcpy((void *)kvmppc_booke_handlers + ivor[i],
+		       (void *)handler[i], handler_len);
+	}
+
+	handler_len = handler[max_ivor + 1] - handler[max_ivor];
+	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+			   ivor[max_ivor] + handler_len);
+#endif /* !BOOKE_HV */
+	return 0;
+}
+
+void __exit kvmppc_booke_exit(void)
+{
+	free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+	kvm_exit();
+}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
new file mode 100644
index 0000000000..9c5b8e7601
--- /dev/null
+++ b/arch/powerpc/kvm/booke.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __KVM_BOOKE_H__
+#define __KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include <asm/switch_to.h>
+#include "timing.h"
+
+/* interrupt priortity ordering */
+#define BOOKE_IRQPRIO_DATA_STORAGE 0
+#define BOOKE_IRQPRIO_INST_STORAGE 1
+#define BOOKE_IRQPRIO_ALIGNMENT 2
+#define BOOKE_IRQPRIO_PROGRAM 3
+#define BOOKE_IRQPRIO_FP_UNAVAIL 4
+#ifdef CONFIG_SPE_POSSIBLE
+#define BOOKE_IRQPRIO_SPE_UNAVAIL 5
+#define BOOKE_IRQPRIO_SPE_FP_DATA 6
+#define BOOKE_IRQPRIO_SPE_FP_ROUND 7
+#endif
+#ifdef CONFIG_PPC_E500MC
+#define BOOKE_IRQPRIO_ALTIVEC_UNAVAIL 5
+#define BOOKE_IRQPRIO_ALTIVEC_ASSIST 6
+#endif
+#define BOOKE_IRQPRIO_SYSCALL 8
+#define BOOKE_IRQPRIO_AP_UNAVAIL 9
+#define BOOKE_IRQPRIO_DTLB_MISS 10
+#define BOOKE_IRQPRIO_ITLB_MISS 11
+#define BOOKE_IRQPRIO_MACHINE_CHECK 12
+#define BOOKE_IRQPRIO_DEBUG 13
+#define BOOKE_IRQPRIO_CRITICAL 14
+#define BOOKE_IRQPRIO_WATCHDOG 15
+#define BOOKE_IRQPRIO_EXTERNAL 16
+#define BOOKE_IRQPRIO_FIT 17
+#define BOOKE_IRQPRIO_DECREMENTER 18
+#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
+/* Internal pseudo-irqprio for level triggered externals */
+#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
+#define BOOKE_IRQPRIO_DBELL 21
+#define BOOKE_IRQPRIO_DBELL_CRIT 22
+#define BOOKE_IRQPRIO_MAX 23
+
+#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \
+			  (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \
+			  (1 << BOOKE_IRQPRIO_DBELL) | \
+			  (1 << BOOKE_IRQPRIO_DECREMENTER) | \
+			  (1 << BOOKE_IRQPRIO_FIT) | \
+			  (1 << BOOKE_IRQPRIO_EXTERNAL))
+
+#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \
+			  (1 << BOOKE_IRQPRIO_WATCHDOG) | \
+			  (1 << BOOKE_IRQPRIO_CRITICAL))
+
+extern unsigned long kvmppc_booke_handlers;
+extern unsigned long kvmppc_booke_handler_addr[];
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+
+int kvmppc_booke_emulate_op(struct kvm_vcpu *vcpu,
+                            unsigned int inst, int *advance);
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val);
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val);
+
+/* low-level asm code to transfer guest state */
+void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
+void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
+
+/* high-level function, manages flags, host state */
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
+
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
+
+enum int_class {
+	INT_CLASS_NONCRIT,
+	INT_CLASS_CRIT,
+	INT_CLASS_MC,
+	INT_CLASS_DBG,
+};
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
+
+extern int kvmppc_core_emulate_op_e500(struct kvm_vcpu *vcpu,
+				       unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong *spr_val);
+extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong *spr_val);
+
+static inline void kvmppc_clear_dbsr(void)
+{
+	mtspr(SPRN_DBSR, mfspr(SPRN_DBSR));
+}
+
+int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr);
+
+#endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
new file mode 100644
index 0000000000..d8d38aca71
--- /dev/null
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -0,0 +1,511 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2008
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/disassemble.h>
+
+#include "booke.h"
+
+#define OP_19_XOP_RFI     50
+#define OP_19_XOP_RFCI    51
+#define OP_19_XOP_RFDI    39
+
+#define OP_31_XOP_MFMSR   83
+#define OP_31_XOP_WRTEE   131
+#define OP_31_XOP_MTMSR   146
+#define OP_31_XOP_WRTEEI  163
+
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.regs.nip = vcpu->arch.shared->srr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
+}
+
+static void kvmppc_emul_rfdi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.regs.nip = vcpu->arch.dsrr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.dsrr1);
+}
+
+static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.regs.nip = vcpu->arch.csrr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
+}
+
+int kvmppc_booke_emulate_op(struct kvm_vcpu *vcpu,
+                            unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int rs = get_rs(inst);
+	int rt = get_rt(inst);
+
+	switch (get_op(inst)) {
+	case 19:
+		switch (get_xop(inst)) {
+		case OP_19_XOP_RFI:
+			kvmppc_emul_rfi(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
+			*advance = 0;
+			break;
+
+		case OP_19_XOP_RFCI:
+			kvmppc_emul_rfci(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS);
+			*advance = 0;
+			break;
+
+		case OP_19_XOP_RFDI:
+			kvmppc_emul_rfdi(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFDI_EXITS);
+			*advance = 0;
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+
+	case 31:
+		switch (get_xop(inst)) {
+
+		case OP_31_XOP_MFMSR:
+			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
+			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
+			break;
+
+		case OP_31_XOP_MTMSR:
+			kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
+			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
+			break;
+
+		case OP_31_XOP_WRTEE:
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
+					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+			break;
+
+		case OP_31_XOP_WRTEEI:
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
+							 | (inst & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+		}
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	return emulated;
+}
+
+/*
+ * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode).
+ * Their backing store is in real registers, and these functions
+ * will return the wrong result if called for them in another context
+ * (such as debugging).
+ */
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	int emulated = EMULATE_DONE;
+	bool debug_inst = false;
+
+	switch (sprn) {
+	case SPRN_DEAR:
+		vcpu->arch.shared->dar = spr_val;
+		break;
+	case SPRN_ESR:
+		vcpu->arch.shared->esr = spr_val;
+		break;
+	case SPRN_CSRR0:
+		vcpu->arch.csrr0 = spr_val;
+		break;
+	case SPRN_CSRR1:
+		vcpu->arch.csrr1 = spr_val;
+		break;
+	case SPRN_DSRR0:
+		vcpu->arch.dsrr0 = spr_val;
+		break;
+	case SPRN_DSRR1:
+		vcpu->arch.dsrr1 = spr_val;
+		break;
+	case SPRN_IAC1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac1 = spr_val;
+		break;
+	case SPRN_IAC2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac2 = spr_val;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case SPRN_IAC3:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac3 = spr_val;
+		break;
+	case SPRN_IAC4:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac4 = spr_val;
+		break;
+#endif
+	case SPRN_DAC1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dac1 = spr_val;
+		break;
+	case SPRN_DAC2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dac2 = spr_val;
+		break;
+	case SPRN_DBCR0:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		spr_val &= (DBCR0_IDM | DBCR0_IC | DBCR0_BT | DBCR0_TIE |
+			DBCR0_IAC1 | DBCR0_IAC2 | DBCR0_IAC3 | DBCR0_IAC4  |
+			DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W);
+
+		vcpu->arch.dbg_reg.dbcr0 = spr_val;
+		break;
+	case SPRN_DBCR1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dbcr1 = spr_val;
+		break;
+	case SPRN_DBCR2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dbcr2 = spr_val;
+		break;
+	case SPRN_DBSR:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		vcpu->arch.dbsr &= ~spr_val;
+		if (!(vcpu->arch.dbsr & ~DBSR_IDE))
+			kvmppc_core_dequeue_debug(vcpu);
+		break;
+	case SPRN_TSR:
+		kvmppc_clr_tsr_bits(vcpu, spr_val);
+		break;
+	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
+		kvmppc_set_tcr(vcpu, spr_val);
+		break;
+
+	case SPRN_DECAR:
+		vcpu->arch.decar = spr_val;
+		break;
+	/*
+	 * Note: SPRG4-7 are user-readable.
+	 * These values are loaded into the real SPRGs when resuming the
+	 * guest (PR-mode only).
+	 */
+	case SPRN_SPRG4:
+		kvmppc_set_sprg4(vcpu, spr_val);
+		break;
+	case SPRN_SPRG5:
+		kvmppc_set_sprg5(vcpu, spr_val);
+		break;
+	case SPRN_SPRG6:
+		kvmppc_set_sprg6(vcpu, spr_val);
+		break;
+	case SPRN_SPRG7:
+		kvmppc_set_sprg7(vcpu, spr_val);
+		break;
+
+	case SPRN_IVPR:
+		vcpu->arch.ivpr = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVPR, spr_val);
+#endif
+		break;
+	case SPRN_IVOR0:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
+		break;
+	case SPRN_IVOR1:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val;
+		break;
+	case SPRN_IVOR2:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR2, spr_val);
+#endif
+		break;
+	case SPRN_IVOR3:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
+		break;
+	case SPRN_IVOR4:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val;
+		break;
+	case SPRN_IVOR5:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val;
+		break;
+	case SPRN_IVOR6:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val;
+		break;
+	case SPRN_IVOR7:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR8:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR8, spr_val);
+#endif
+		break;
+	case SPRN_IVOR9:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR10:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val;
+		break;
+	case SPRN_IVOR11:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val;
+		break;
+	case SPRN_IVOR12:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val;
+		break;
+	case SPRN_IVOR13:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val;
+		break;
+	case SPRN_IVOR14:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val;
+		break;
+	case SPRN_IVOR15:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
+		break;
+	case SPRN_MCSR:
+		vcpu->arch.mcsr &= ~spr_val;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		kvmppc_set_epcr(vcpu, spr_val);
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+#endif
+		break;
+#endif
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (debug_inst) {
+		current->thread.debug = vcpu->arch.dbg_reg;
+		switch_booke_debug_regs(&vcpu->arch.dbg_reg);
+	}
+	return emulated;
+}
+
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_IVPR:
+		*spr_val = vcpu->arch.ivpr;
+		break;
+	case SPRN_DEAR:
+		*spr_val = vcpu->arch.shared->dar;
+		break;
+	case SPRN_ESR:
+		*spr_val = vcpu->arch.shared->esr;
+		break;
+	case SPRN_EPR:
+		*spr_val = vcpu->arch.epr;
+		break;
+	case SPRN_CSRR0:
+		*spr_val = vcpu->arch.csrr0;
+		break;
+	case SPRN_CSRR1:
+		*spr_val = vcpu->arch.csrr1;
+		break;
+	case SPRN_DSRR0:
+		*spr_val = vcpu->arch.dsrr0;
+		break;
+	case SPRN_DSRR1:
+		*spr_val = vcpu->arch.dsrr1;
+		break;
+	case SPRN_IAC1:
+		*spr_val = vcpu->arch.dbg_reg.iac1;
+		break;
+	case SPRN_IAC2:
+		*spr_val = vcpu->arch.dbg_reg.iac2;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case SPRN_IAC3:
+		*spr_val = vcpu->arch.dbg_reg.iac3;
+		break;
+	case SPRN_IAC4:
+		*spr_val = vcpu->arch.dbg_reg.iac4;
+		break;
+#endif
+	case SPRN_DAC1:
+		*spr_val = vcpu->arch.dbg_reg.dac1;
+		break;
+	case SPRN_DAC2:
+		*spr_val = vcpu->arch.dbg_reg.dac2;
+		break;
+	case SPRN_DBCR0:
+		*spr_val = vcpu->arch.dbg_reg.dbcr0;
+		if (vcpu->guest_debug)
+			*spr_val = *spr_val | DBCR0_EDM;
+		break;
+	case SPRN_DBCR1:
+		*spr_val = vcpu->arch.dbg_reg.dbcr1;
+		break;
+	case SPRN_DBCR2:
+		*spr_val = vcpu->arch.dbg_reg.dbcr2;
+		break;
+	case SPRN_DBSR:
+		*spr_val = vcpu->arch.dbsr;
+		break;
+	case SPRN_TSR:
+		*spr_val = vcpu->arch.tsr;
+		break;
+	case SPRN_TCR:
+		*spr_val = vcpu->arch.tcr;
+		break;
+
+	case SPRN_IVOR0:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+		break;
+	case SPRN_IVOR1:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+		break;
+	case SPRN_IVOR2:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+		break;
+	case SPRN_IVOR3:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+		break;
+	case SPRN_IVOR4:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+		break;
+	case SPRN_IVOR5:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+		break;
+	case SPRN_IVOR6:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+		break;
+	case SPRN_IVOR7:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+		break;
+	case SPRN_IVOR8:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+		break;
+	case SPRN_IVOR9:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+		break;
+	case SPRN_IVOR10:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+		break;
+	case SPRN_IVOR11:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+		break;
+	case SPRN_IVOR12:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+		break;
+	case SPRN_IVOR13:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+		break;
+	case SPRN_IVOR14:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+		break;
+	case SPRN_IVOR15:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+		break;
+	case SPRN_MCSR:
+		*spr_val = vcpu->arch.mcsr;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		*spr_val = vcpu->arch.epcr;
+		break;
+#endif
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
new file mode 100644
index 0000000000..205545d820
--- /dev/null
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -0,0 +1,535 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+/* The host stack layout: */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  4
+#define HOST_RUN        8
+/* r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option. */
+#define HOST_R2         12
+#define HOST_CR         16
+#define HOST_NV_GPRS    20
+#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * 4))
+#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n)
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + 4)
+#define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */
+#define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
+
+#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))
+
+#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_ALIGNMENT))
+
+#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_INST_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_PROGRAM) | \
+                       (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                       (1<<BOOKE_INTERRUPT_ALIGNMENT))
+
+.macro __KVM_HANDLER ivor_nr scratch srr0
+	/* Get pointer to vcpu and record exit number. */
+	mtspr	\scratch , r4
+	mfspr   r4, SPRN_SPRG_THREAD
+	lwz     r4, THREAD_KVM_VCPU(r4)
+	stw	r3, VCPU_GPR(R3)(r4)
+	stw	r5, VCPU_GPR(R5)(r4)
+	stw	r6, VCPU_GPR(R6)(r4)
+	mfspr	r3, \scratch
+	mfctr	r5
+	stw	r3, VCPU_GPR(R4)(r4)
+	stw	r5, VCPU_CTR(r4)
+	mfspr	r3, \srr0
+	lis	r6, kvmppc_resume_host@h
+	stw	r3, VCPU_PC(r4)
+	li	r5, \ivor_nr
+	ori	r6, r6, kvmppc_resume_host@l
+	mtctr	r6
+	bctr
+.endm
+
+.macro KVM_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_DBG_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	mtspr   \scratch, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	stw	r3, VCPU_CRIT_SAVE(r4)
+	mfcr	r3
+	mfspr	r4, SPRN_CSRR1
+	andi.	r4, r4, MSR_PR
+	bne	1f
+	/* debug interrupt happened in enter/exit path */
+	mfspr   r4, SPRN_CSRR1
+	rlwinm  r4, r4, 0, ~MSR_DE
+	mtspr   SPRN_CSRR1, r4
+	lis	r4, 0xffff
+	ori	r4, r4, 0xffff
+	mtspr	SPRN_DBSR, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	mtcr	r3
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	rfci
+1:	/* debug interrupt happened in guest */
+	mtcr	r3
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_HANDLER_ADDR ivor_nr
+	.long	kvmppc_handler_\ivor_nr
+.endm
+
+.macro KVM_HANDLER_END
+	.long	kvmppc_handlers_end
+.endm
+
+_GLOBAL(kvmppc_handlers_start)
+KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK  SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+_GLOBAL(kvmppc_handlers_end)
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r4
+ *  r4: vcpu pointer
+ *  r5: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	mfcr	r3
+	stw	r3, VCPU_CR(r4)
+	stw	r7, VCPU_GPR(R7)(r4)
+	stw	r8, VCPU_GPR(R8)(r4)
+	stw	r9, VCPU_GPR(R9)(r4)
+
+	li	r6, 1
+	slw	r6, r6, r5
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:
+	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	bne	1b
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	/* Save the faulting instruction and all GPRs for emulation. */
+	andi.	r7, r6, NEED_INST_MASK
+	beq	..skip_inst_copy
+	mfspr	r9, SPRN_SRR0
+	mfmsr	r8
+	ori	r7, r8, MSR_DS
+	mtmsr	r7
+	isync
+	lwz	r9, 0(r9)
+	mtmsr	r8
+	isync
+	stw	r9, VCPU_LAST_INST(r4)
+
+	stw	r15, VCPU_GPR(R15)(r4)
+	stw	r16, VCPU_GPR(R16)(r4)
+	stw	r17, VCPU_GPR(R17)(r4)
+	stw	r18, VCPU_GPR(R18)(r4)
+	stw	r19, VCPU_GPR(R19)(r4)
+	stw	r20, VCPU_GPR(R20)(r4)
+	stw	r21, VCPU_GPR(R21)(r4)
+	stw	r22, VCPU_GPR(R22)(r4)
+	stw	r23, VCPU_GPR(R23)(r4)
+	stw	r24, VCPU_GPR(R24)(r4)
+	stw	r25, VCPU_GPR(R25)(r4)
+	stw	r26, VCPU_GPR(R26)(r4)
+	stw	r27, VCPU_GPR(R27)(r4)
+	stw	r28, VCPU_GPR(R28)(r4)
+	stw	r29, VCPU_GPR(R29)(r4)
+	stw	r30, VCPU_GPR(R30)(r4)
+	stw	r31, VCPU_GPR(R31)(r4)
+..skip_inst_copy:
+
+	/* Also grab DEAR and ESR before the host can clobber them. */
+
+	andi.	r7, r6, NEED_DEAR_MASK
+	beq	..skip_dear
+	mfspr	r9, SPRN_DEAR
+	stw	r9, VCPU_FAULT_DEAR(r4)
+..skip_dear:
+
+	andi.	r7, r6, NEED_ESR_MASK
+	beq	..skip_esr
+	mfspr	r9, SPRN_ESR
+	stw	r9, VCPU_FAULT_ESR(r4)
+..skip_esr:
+
+	/* Save remaining volatile guest register state to vcpu. */
+	stw	r0, VCPU_GPR(R0)(r4)
+	stw	r1, VCPU_GPR(R1)(r4)
+	stw	r2, VCPU_GPR(R2)(r4)
+	stw	r10, VCPU_GPR(R10)(r4)
+	stw	r11, VCPU_GPR(R11)(r4)
+	stw	r12, VCPU_GPR(R12)(r4)
+	stw	r13, VCPU_GPR(R13)(r4)
+	stw	r14, VCPU_GPR(R14)(r4) /* We need a NV GPR below. */
+	mflr	r3
+	stw	r3, VCPU_LR(r4)
+	mfxer	r3
+	stw	r3, VCPU_XER(r4)
+
+	/* Restore host stack pointer and PID before IVPR, since the host
+	 * exception handlers use them. */
+	lwz	r1, VCPU_HOST_STACK(r4)
+	lwz	r3, VCPU_HOST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+#ifdef CONFIG_PPC_85xx
+	/* we cheat and know that Linux doesn't use PID1 which is always 0 */
+	lis	r3, 0
+	mtspr	SPRN_PID1, r3
+#endif
+
+	/* Restore host IVPR before re-enabling interrupts. We cheat and know
+	 * that Linux IVPR is always 0xc0000000. */
+	lis	r3, 0xc000
+	mtspr	SPRN_IVPR, r3
+
+	/* Switch to kernel stack and jump to handler. */
+	LOAD_REG_ADDR(r3, kvmppc_handle_exit)
+	mtctr	r3
+	mr	r3, r4
+	lwz	r2, HOST_R2(r1)
+	mr	r14, r4 /* Save vcpu pointer. */
+
+	bctrl	/* kvmppc_handle_exit() */
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	lwz	r14, VCPU_GPR(R14)(r4)
+
+	/* Sometimes instruction emulation must restore complete GPR state. */
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	..skip_nv_load
+	lwz	r15, VCPU_GPR(R15)(r4)
+	lwz	r16, VCPU_GPR(R16)(r4)
+	lwz	r17, VCPU_GPR(R17)(r4)
+	lwz	r18, VCPU_GPR(R18)(r4)
+	lwz	r19, VCPU_GPR(R19)(r4)
+	lwz	r20, VCPU_GPR(R20)(r4)
+	lwz	r21, VCPU_GPR(R21)(r4)
+	lwz	r22, VCPU_GPR(R22)(r4)
+	lwz	r23, VCPU_GPR(R23)(r4)
+	lwz	r24, VCPU_GPR(R24)(r4)
+	lwz	r25, VCPU_GPR(R25)(r4)
+	lwz	r26, VCPU_GPR(R26)(r4)
+	lwz	r27, VCPU_GPR(R27)(r4)
+	lwz	r28, VCPU_GPR(R28)(r4)
+	lwz	r29, VCPU_GPR(R29)(r4)
+	lwz	r30, VCPU_GPR(R30)(r4)
+	lwz	r31, VCPU_GPR(R31)(r4)
+..skip_nv_load:
+
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+
+#ifdef CONFIG_SPE
+	/* save guest SPEFSCR and load host SPEFSCR */
+	mfspr	r9, SPRN_SPEFSCR
+	stw	r9, VCPU_SPEFSCR(r4)
+	lwz	r9, VCPU_HOST_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r9
+#endif
+
+	/* We already saved guest volatile register state; now save the
+	 * non-volatiles. */
+	stw	r15, VCPU_GPR(R15)(r4)
+	stw	r16, VCPU_GPR(R16)(r4)
+	stw	r17, VCPU_GPR(R17)(r4)
+	stw	r18, VCPU_GPR(R18)(r4)
+	stw	r19, VCPU_GPR(R19)(r4)
+	stw	r20, VCPU_GPR(R20)(r4)
+	stw	r21, VCPU_GPR(R21)(r4)
+	stw	r22, VCPU_GPR(R22)(r4)
+	stw	r23, VCPU_GPR(R23)(r4)
+	stw	r24, VCPU_GPR(R24)(r4)
+	stw	r25, VCPU_GPR(R25)(r4)
+	stw	r26, VCPU_GPR(R26)(r4)
+	stw	r27, VCPU_GPR(R27)(r4)
+	stw	r28, VCPU_GPR(R28)(r4)
+	stw	r29, VCPU_GPR(R29)(r4)
+	stw	r30, VCPU_GPR(R30)(r4)
+	stw	r31, VCPU_GPR(R31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	lwz	r14, HOST_NV_GPR(R14)(r1)
+	lwz	r15, HOST_NV_GPR(R15)(r1)
+	lwz	r16, HOST_NV_GPR(R16)(r1)
+	lwz	r17, HOST_NV_GPR(R17)(r1)
+	lwz	r18, HOST_NV_GPR(R18)(r1)
+	lwz	r19, HOST_NV_GPR(R19)(r1)
+	lwz	r20, HOST_NV_GPR(R20)(r1)
+	lwz	r21, HOST_NV_GPR(R21)(r1)
+	lwz	r22, HOST_NV_GPR(R22)(r1)
+	lwz	r23, HOST_NV_GPR(R23)(r1)
+	lwz	r24, HOST_NV_GPR(R24)(r1)
+	lwz	r25, HOST_NV_GPR(R25)(r1)
+	lwz	r26, HOST_NV_GPR(R26)(r1)
+	lwz	r27, HOST_NV_GPR(R27)(r1)
+	lwz	r28, HOST_NV_GPR(R28)(r1)
+	lwz	r29, HOST_NV_GPR(R29)(r1)
+	lwz	r30, HOST_NV_GPR(R30)(r1)
+	lwz	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	lwz	r4, HOST_STACK_LR(r1)
+	lwz	r5, HOST_CR(r1)
+	addi	r1, r1, HOST_STACK_SIZE
+	mtlr	r4
+	mtcr	r5
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+
+/* Registers:
+ *  r3: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	stw	r1, VCPU_HOST_STACK(r3)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	mr	r4, r3
+	mflr	r3
+	stw	r3, HOST_STACK_LR(r1)
+	mfcr	r5
+	stw	r5, HOST_CR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	stw	r14, HOST_NV_GPR(R14)(r1)
+	stw	r15, HOST_NV_GPR(R15)(r1)
+	stw	r16, HOST_NV_GPR(R16)(r1)
+	stw	r17, HOST_NV_GPR(R17)(r1)
+	stw	r18, HOST_NV_GPR(R18)(r1)
+	stw	r19, HOST_NV_GPR(R19)(r1)
+	stw	r20, HOST_NV_GPR(R20)(r1)
+	stw	r21, HOST_NV_GPR(R21)(r1)
+	stw	r22, HOST_NV_GPR(R22)(r1)
+	stw	r23, HOST_NV_GPR(R23)(r1)
+	stw	r24, HOST_NV_GPR(R24)(r1)
+	stw	r25, HOST_NV_GPR(R25)(r1)
+	stw	r26, HOST_NV_GPR(R26)(r1)
+	stw	r27, HOST_NV_GPR(R27)(r1)
+	stw	r28, HOST_NV_GPR(R28)(r1)
+	stw	r29, HOST_NV_GPR(R29)(r1)
+	stw	r30, HOST_NV_GPR(R30)(r1)
+	stw	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Load guest non-volatiles. */
+	lwz	r14, VCPU_GPR(R14)(r4)
+	lwz	r15, VCPU_GPR(R15)(r4)
+	lwz	r16, VCPU_GPR(R16)(r4)
+	lwz	r17, VCPU_GPR(R17)(r4)
+	lwz	r18, VCPU_GPR(R18)(r4)
+	lwz	r19, VCPU_GPR(R19)(r4)
+	lwz	r20, VCPU_GPR(R20)(r4)
+	lwz	r21, VCPU_GPR(R21)(r4)
+	lwz	r22, VCPU_GPR(R22)(r4)
+	lwz	r23, VCPU_GPR(R23)(r4)
+	lwz	r24, VCPU_GPR(R24)(r4)
+	lwz	r25, VCPU_GPR(R25)(r4)
+	lwz	r26, VCPU_GPR(R26)(r4)
+	lwz	r27, VCPU_GPR(R27)(r4)
+	lwz	r28, VCPU_GPR(R28)(r4)
+	lwz	r29, VCPU_GPR(R29)(r4)
+	lwz	r30, VCPU_GPR(R30)(r4)
+	lwz	r31, VCPU_GPR(R31)(r4)
+
+#ifdef CONFIG_SPE
+	/* save host SPEFSCR and load guest SPEFSCR */
+	mfspr	r3, SPRN_SPEFSCR
+	stw	r3, VCPU_HOST_SPEFSCR(r4)
+	lwz	r3, VCPU_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r3
+#endif
+
+lightweight_exit:
+	stw	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_SHADOW_PID(r4)
+	mtspr	SPRN_PID, r3
+
+#ifdef CONFIG_PPC_85xx
+	lwz	r3, VCPU_SHADOW_PID1(r4)
+	mtspr	SPRN_PID1, r3
+#endif
+
+	/* Load some guest volatiles. */
+	lwz	r0, VCPU_GPR(R0)(r4)
+	lwz	r2, VCPU_GPR(R2)(r4)
+	lwz	r9, VCPU_GPR(R9)(r4)
+	lwz	r10, VCPU_GPR(R10)(r4)
+	lwz	r11, VCPU_GPR(R11)(r4)
+	lwz	r12, VCPU_GPR(R12)(r4)
+	lwz	r13, VCPU_GPR(R13)(r4)
+	lwz	r3, VCPU_LR(r4)
+	mtlr	r3
+	lwz	r3, VCPU_XER(r4)
+	mtxer	r3
+
+	/* Switch the IVPR. XXX If we take a TLB miss after this we're screwed,
+	 * so how do we make sure vcpu won't fault? */
+	lis	r8, kvmppc_booke_handlers@ha
+	lwz	r8, kvmppc_booke_handlers@l(r8)
+	mtspr	SPRN_IVPR, r8
+
+	lwz	r5, VCPU_SHARED(r4)
+
+	/* Can't switch the stack pointer until after IVPR is switched,
+	 * because host interrupt handlers would get confused. */
+	lwz	r1, VCPU_GPR(R1)(r4)
+
+	/*
+	 * Host interrupt handlers may have clobbered these
+	 * guest-readable SPRGs, or the guest kernel may have
+	 * written directly to the shared area, so we
+	 * need to reload them here with the guest's values.
+	 */
+	PPC_LD(r3, VCPU_SHARED_SPRG4, r5)
+	mtspr	SPRN_SPRG4W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG5, r5)
+	mtspr	SPRN_SPRG5W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG6, r5)
+	mtspr	SPRN_SPRG6W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG7, r5)
+	mtspr	SPRN_SPRG7W, r3
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r7, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	bne	1b
+	stw	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/* Finish loading guest volatiles and jump to guest. */
+	lwz	r3, VCPU_CTR(r4)
+	lwz	r5, VCPU_CR(r4)
+	lwz	r6, VCPU_PC(r4)
+	lwz	r7, VCPU_SHADOW_MSR(r4)
+	mtctr	r3
+	mtcr	r5
+	mtsrr0	r6
+	mtsrr1	r7
+	lwz	r5, VCPU_GPR(R5)(r4)
+	lwz	r6, VCPU_GPR(R6)(r4)
+	lwz	r7, VCPU_GPR(R7)(r4)
+	lwz	r8, VCPU_GPR(R8)(r4)
+
+	/* Clear any debug events which occurred since we disabled MSR[DE].
+	 * XXX This gives us a 3-instruction window in which a breakpoint
+	 * intended for guest context could fire in the host instead. */
+	lis	r3, 0xffff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_DBSR, r3
+
+	lwz	r3, VCPU_GPR(R3)(r4)
+	lwz	r4, VCPU_GPR(R4)(r4)
+	rfi
+
+	.data
+	.align	4
+	.globl	kvmppc_booke_handler_addr
+kvmppc_booke_handler_addr:
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND
+KVM_HANDLER_END /*Always keep this in end*/
+
+#ifdef CONFIG_SPE
+_GLOBAL(kvmppc_save_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	SAVE_32EVRS(0, r4, r3, VCPU_EVR)
+	evxor   evr6, evr6, evr6
+	evmwumiaa evr6, evr6, evr6
+	li	r4,VCPU_ACC
+	evstddx evr6, r4, r3		/* save acc */
+	blr
+
+_GLOBAL(kvmppc_load_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	li      r4,VCPU_ACC
+	evlddx  evr6,r4,r3
+	evmra   evr6,evr6		/* load acc */
+	REST_32EVRS(0, r4, r3, VCPU_EVR)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
new file mode 100644
index 0000000000..8b4a402217
--- /dev/null
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -0,0 +1,673 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright (C) 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ * Author: Scott Wood <scotwood@freescale.com>
+ * Author: Mihai Caraman <mihai.caraman@freescale.com>
+ *
+ * This file is derived from arch/powerpc/kvm/booke_interrupts.S
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+
+#ifdef CONFIG_64BIT
+#include <asm/exception-64e.h>
+#include <asm/hw_irq.h>
+#include <asm/irqflags.h>
+#else
+#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
+#endif
+
+#define LONGBYTES		(BITS_PER_LONG / 8)
+
+#define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES))
+
+/* The host stack layout: */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  PPC_LR_STKOFF
+#define HOST_RUN        (HOST_CALLEE_LR + LONGBYTES)
+/*
+ * r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option.
+ */
+#define HOST_R2         (HOST_RUN + LONGBYTES)
+#define HOST_CR         (HOST_R2 + LONGBYTES)
+#define HOST_NV_GPRS    (HOST_CR + LONGBYTES)
+#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
+#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n)
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
+#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
+/* LR in caller stack frame. */
+#define HOST_STACK_LR	(HOST_STACK_SIZE + PPC_LR_STKOFF)
+
+#define NEED_EMU		0x00000001 /* emulation -- save nv regs */
+#define NEED_DEAR		0x00000002 /* save faulting DEAR */
+#define NEED_ESR		0x00000004 /* save faulting ESR */
+
+/*
+ * On entry:
+ * r4 = vcpu, r5 = srr0, r6 = srr1
+ * saved in vcpu: cr, ctr, r3-r13
+ */
+.macro kvm_handler_common intno, srr0, flags
+	/* Restore host stack pointer */
+	PPC_STL	r1, VCPU_GPR(R1)(r4)
+	PPC_STL	r2, VCPU_GPR(R2)(r4)
+	PPC_LL	r1, VCPU_HOST_STACK(r4)
+	PPC_LL	r2, HOST_R2(r1)
+
+START_BTB_FLUSH_SECTION
+	BTB_FLUSH(r10)
+END_BTB_FLUSH_SECTION
+
+	mfspr	r10, SPRN_PID
+	lwz	r8, VCPU_HOST_PID(r4)
+	PPC_LL	r11, VCPU_SHARED(r4)
+	PPC_STL	r14, VCPU_GPR(R14)(r4) /* We need a non-volatile GPR. */
+	li	r14, \intno
+
+	stw	r10, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r8
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
+	bne-	1b
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	oris	r8, r6, MSR_CE@h
+	PPC_STD(r6, VCPU_SHARED_MSR, r11)
+	ori	r8, r8, MSR_ME | MSR_RI
+	PPC_STL	r5, VCPU_PC(r4)
+
+	/*
+	 * Make sure CE/ME/RI are set (if appropriate for exception type)
+	 * whether or not the guest had it set.  Since mfmsr/mtmsr are
+	 * somewhat expensive, skip in the common case where the guest
+	 * had all these bits set (and thus they're still set if
+	 * appropriate for the exception type).
+	 */
+	cmpw	r6, r8
+	beq	1f
+	mfmsr	r7
+	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0
+	oris	r7, r7, MSR_CE@h
+	.endif
+	.if	\srr0 != SPRN_MCSRR0
+	ori	r7, r7, MSR_ME | MSR_RI
+	.endif
+	mtmsr	r7
+1:
+
+	.if	\flags & NEED_EMU
+	PPC_STL	r15, VCPU_GPR(R15)(r4)
+	PPC_STL	r16, VCPU_GPR(R16)(r4)
+	PPC_STL	r17, VCPU_GPR(R17)(r4)
+	PPC_STL	r18, VCPU_GPR(R18)(r4)
+	PPC_STL	r19, VCPU_GPR(R19)(r4)
+	PPC_STL	r20, VCPU_GPR(R20)(r4)
+	PPC_STL	r21, VCPU_GPR(R21)(r4)
+	PPC_STL	r22, VCPU_GPR(R22)(r4)
+	PPC_STL	r23, VCPU_GPR(R23)(r4)
+	PPC_STL	r24, VCPU_GPR(R24)(r4)
+	PPC_STL	r25, VCPU_GPR(R25)(r4)
+	PPC_STL	r26, VCPU_GPR(R26)(r4)
+	PPC_STL	r27, VCPU_GPR(R27)(r4)
+	PPC_STL	r28, VCPU_GPR(R28)(r4)
+	PPC_STL	r29, VCPU_GPR(R29)(r4)
+	PPC_STL	r30, VCPU_GPR(R30)(r4)
+	PPC_STL	r31, VCPU_GPR(R31)(r4)
+
+	/*
+	 * We don't use external PID support. lwepx faults would need to be
+	 * handled by KVM and this implies aditional code in DO_KVM (for
+	 * DTB_MISS, DSI and LRAT) to check ESR[EPID] and EPLC[EGS] which
+	 * is too intrusive for the host. Get last instuction in
+	 * kvmppc_get_last_inst().
+	 */
+	li	r9, KVM_INST_FETCH_FAILED
+	PPC_STL	r9, VCPU_LAST_INST(r4)
+	.endif
+
+	.if	\flags & NEED_ESR
+	mfspr	r8, SPRN_ESR
+	PPC_STL	r8, VCPU_FAULT_ESR(r4)
+	.endif
+
+	.if	\flags & NEED_DEAR
+	mfspr	r9, SPRN_DEAR
+	PPC_STL	r9, VCPU_FAULT_DEAR(r4)
+	.endif
+
+	b	kvmppc_resume_host
+.endm
+
+#ifdef CONFIG_64BIT
+/* Exception types */
+#define EX_GEN			1
+#define EX_GDBELL		2
+#define EX_DBG			3
+#define EX_MC			4
+#define EX_CRIT			5
+#define EX_TLB			6
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
+ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mr	r11, r4
+	/*
+	 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
+	 */
+	PPC_LL	r4, PACACURRENT(r13)
+	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4)
+	PPC_STL	r10, VCPU_CR(r4)
+	PPC_STL r11, VCPU_GPR(R4)(r4)
+	PPC_STL	r5, VCPU_GPR(R5)(r4)
+	PPC_STL	r6, VCPU_GPR(R6)(r4)
+	PPC_STL	r8, VCPU_GPR(R8)(r4)
+	PPC_STL	r9, VCPU_GPR(R9)(r4)
+	.if \type == EX_TLB
+	PPC_LL	r5, EX_TLB_R13(r12)
+	PPC_LL	r6, EX_TLB_R10(r12)
+	PPC_LL	r8, EX_TLB_R11(r12)
+	mfspr	r12, \scratch
+	.else
+	mfspr	r5, \scratch
+	PPC_LL	r6, (\paca_ex + \ex_r10)(r13)
+	PPC_LL	r8, (\paca_ex + \ex_r11)(r13)
+	.endif
+	PPC_STL r5, VCPU_GPR(R13)(r4)
+	PPC_STL r3, VCPU_GPR(R3)(r4)
+	PPC_STL r7, VCPU_GPR(R7)(r4)
+	PPC_STL r12, VCPU_GPR(R12)(r4)
+	PPC_STL r6, VCPU_GPR(R10)(r4)
+	PPC_STL r8, VCPU_GPR(R11)(r4)
+	mfctr	r5
+	PPC_STL	r5, VCPU_CTR(r4)
+	mfspr	r5, \srr0
+	mfspr	r6, \srr1
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+#define EX_PARAMS(type) \
+	EX_##type, \
+	SPRN_SPRG_##type##_SCRATCH, \
+	PACA_EX##type, \
+	EX_R10, \
+	EX_R11
+
+#define EX_PARAMS_TLB \
+	EX_TLB, \
+	SPRN_SPRG_GEN_SCRATCH, \
+	PACA_EXTLB, \
+	EX_TLB_R10, \
+	EX_TLB_R11
+
+kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
+	SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU)
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
+	SPRN_CSRR0, SPRN_CSRR1, 0
+/*
+ * Only bolted TLB miss exception handlers are supported for now
+ */
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
+	SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
+	SPRN_DSRR0, SPRN_DSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+#else
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
+	PPC_STL r3, VCPU_GPR(R3)(r11)
+	mfspr	r3, SPRN_SPRG_RSCRATCH0
+	PPC_STL	r4, VCPU_GPR(R4)(r11)
+	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
+	PPC_STL	r5, VCPU_GPR(R5)(r11)
+	PPC_STL	r13, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(R10)(r11)
+	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
+	PPC_STL	r6, VCPU_GPR(R6)(r11)
+	PPC_STL	r4, VCPU_GPR(R11)(r11)
+	mfspr	r6, \srr1
+	PPC_STL	r7, VCPU_GPR(R7)(r11)
+	PPC_STL	r8, VCPU_GPR(R8)(r11)
+	PPC_STL	r9, VCPU_GPR(R9)(r11)
+	PPC_STL r3, VCPU_GPR(R13)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(R12)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+.macro kvm_lvl_handler intno scratch srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mfspr	r10, SPRN_SPRG_THREAD
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
+	PPC_STL r3, VCPU_GPR(R3)(r11)
+	mfspr	r3, \scratch
+	PPC_STL	r4, VCPU_GPR(R4)(r11)
+	PPC_LL	r4, GPR9(r8)
+	PPC_STL	r5, VCPU_GPR(R5)(r11)
+	PPC_STL	r9, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(R8)(r11)
+	PPC_LL	r3, GPR10(r8)
+	PPC_STL	r6, VCPU_GPR(R6)(r11)
+	PPC_STL	r4, VCPU_GPR(R9)(r11)
+	mfspr	r6, \srr1
+	PPC_LL	r4, GPR11(r8)
+	PPC_STL	r7, VCPU_GPR(R7)(r11)
+	PPC_STL r3, VCPU_GPR(R10)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(R12)(r11)
+	PPC_STL r13, VCPU_GPR(R13)(r11)
+	PPC_STL	r4, VCPU_GPR(R11)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
+	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU)
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
+#endif
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r10
+ *  r4: vcpu pointer
+ *  r11: vcpu->arch.shared
+ *  r14: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	/* Save remaining volatile guest register state to vcpu. */
+	mfspr	r3, SPRN_VRSAVE
+	PPC_STL	r0, VCPU_GPR(R0)(r4)
+	mflr	r5
+	mfspr	r6, SPRN_SPRG4
+	PPC_STL	r5, VCPU_LR(r4)
+	mfspr	r7, SPRN_SPRG5
+	stw	r3, VCPU_VRSAVE(r4)
+#ifdef CONFIG_64BIT
+	PPC_LL	r3, PACA_SPRG_VDSO(r13)
+#endif
+	mfspr	r5, SPRN_SPRG9
+	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
+	mfspr	r8, SPRN_SPRG6
+	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
+	mfspr	r9, SPRN_SPRG7
+#ifdef CONFIG_64BIT
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif
+	PPC_STD(r5, VCPU_SPRG9, r4)
+	PPC_STD(r8, VCPU_SHARED_SPRG6, r11)
+	mfxer	r3
+	PPC_STD(r9, VCPU_SHARED_SPRG7, r11)
+
+	/* save guest MAS registers and restore host mas4 & mas6 */
+	mfspr	r5, SPRN_MAS0
+	PPC_STL	r3, VCPU_XER(r4)
+	mfspr	r6, SPRN_MAS1
+	stw	r5, VCPU_SHARED_MAS0(r11)
+	mfspr	r7, SPRN_MAS2
+	stw	r6, VCPU_SHARED_MAS1(r11)
+	PPC_STD(r7, VCPU_SHARED_MAS2, r11)
+	mfspr	r5, SPRN_MAS3
+	mfspr	r6, SPRN_MAS4
+	stw	r5, VCPU_SHARED_MAS7_3+4(r11)
+	mfspr	r7, SPRN_MAS6
+	stw	r6, VCPU_SHARED_MAS4(r11)
+	mfspr	r5, SPRN_MAS7
+	lwz	r6, VCPU_HOST_MAS4(r4)
+	stw	r7, VCPU_SHARED_MAS6(r11)
+	lwz	r8, VCPU_HOST_MAS6(r4)
+	mtspr	SPRN_MAS4, r6
+	stw	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r8
+	/* Enable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
+	mtspr	SPRN_EPCR, r3
+	isync
+
+	/* Switch to kernel stack and jump to handler. */
+	mr	r3, r4
+	mr	r5, r14 /* intno */
+	mr	r14, r4 /* Save vcpu pointer. */
+	mr	r4, r5
+	bl	kvmppc_handle_exit
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	skip_nv_load
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+skip_nv_load:
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+	PPC_LL	r5, HOST_STACK_LR(r1)
+	lwz	r6, HOST_CR(r1)
+
+	/*
+	 * We already saved guest volatile register state; now save the
+	 * non-volatiles.
+	 */
+
+	PPC_STL	r15, VCPU_GPR(R15)(r4)
+	PPC_STL	r16, VCPU_GPR(R16)(r4)
+	PPC_STL	r17, VCPU_GPR(R17)(r4)
+	PPC_STL	r18, VCPU_GPR(R18)(r4)
+	PPC_STL	r19, VCPU_GPR(R19)(r4)
+	PPC_STL	r20, VCPU_GPR(R20)(r4)
+	PPC_STL	r21, VCPU_GPR(R21)(r4)
+	PPC_STL	r22, VCPU_GPR(R22)(r4)
+	PPC_STL	r23, VCPU_GPR(R23)(r4)
+	PPC_STL	r24, VCPU_GPR(R24)(r4)
+	PPC_STL	r25, VCPU_GPR(R25)(r4)
+	PPC_STL	r26, VCPU_GPR(R26)(r4)
+	PPC_STL	r27, VCPU_GPR(R27)(r4)
+	PPC_STL	r28, VCPU_GPR(R28)(r4)
+	PPC_STL	r29, VCPU_GPR(R29)(r4)
+	PPC_STL	r30, VCPU_GPR(R30)(r4)
+	PPC_STL	r31, VCPU_GPR(R31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	PPC_LL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_LL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_LL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_LL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_LL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_LL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_LL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_LL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_LL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_LL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_LL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_LL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_LL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_LL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_LL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_LL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_LL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_LL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	mtlr	r5
+	mtcr	r6
+	addi	r1, r1, HOST_STACK_SIZE
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+/* Registers:
+ *  r3: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	PPC_STL	r1, VCPU_HOST_STACK(r3)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	mr	r4, r3
+	mflr	r3
+	mfcr	r5
+	PPC_STL	r3, HOST_STACK_LR(r1)
+
+	stw	r5, HOST_CR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	PPC_STL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_STL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_STL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_STL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_STL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_STL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_STL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_STL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_STL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_STL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_STL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_STL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_STL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_STL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_STL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_STL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_STL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_STL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Load guest non-volatiles. */
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+
+
+lightweight_exit:
+	PPC_STL	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+	PPC_LL	r11, VCPU_SHARED(r4)
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+	isync
+	/* Save host mas4 and mas6 and load guest MAS registers */
+	mfspr	r3, SPRN_MAS4
+	stw	r3, VCPU_HOST_MAS4(r4)
+	mfspr	r3, SPRN_MAS6
+	stw	r3, VCPU_HOST_MAS6(r4)
+	lwz	r3, VCPU_SHARED_MAS0(r11)
+	lwz	r5, VCPU_SHARED_MAS1(r11)
+	PPC_LD(r6, VCPU_SHARED_MAS2, r11)
+	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
+	lwz	r8, VCPU_SHARED_MAS4(r11)
+	mtspr	SPRN_MAS0, r3
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS4, r8
+	lwz	r3, VCPU_SHARED_MAS6(r11)
+	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r3
+	mtspr	SPRN_MAS7, r5
+
+	/*
+	 * Host interrupt handlers may have clobbered these guest-readable
+	 * SPRGs, so we need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_VRSAVE(r4)
+	PPC_LD(r5, VCPU_SHARED_SPRG4, r11)
+	mtspr	SPRN_VRSAVE, r3
+	PPC_LD(r6, VCPU_SHARED_SPRG5, r11)
+	mtspr	SPRN_SPRG4W, r5
+	PPC_LD(r7, VCPU_SHARED_SPRG6, r11)
+	mtspr	SPRN_SPRG5W, r6
+	PPC_LD(r8, VCPU_SHARED_SPRG7, r11)
+	mtspr	SPRN_SPRG6W, r7
+	PPC_LD(r5, VCPU_SPRG9, r4)
+	mtspr	SPRN_SPRG7W, r8
+	mtspr	SPRN_SPRG9, r5
+
+	/* Load some guest volatiles. */
+	PPC_LL	r3, VCPU_LR(r4)
+	PPC_LL	r5, VCPU_XER(r4)
+	PPC_LL	r6, VCPU_CTR(r4)
+	PPC_LL	r7, VCPU_CR(r4)
+	PPC_LL	r8, VCPU_PC(r4)
+	PPC_LD(r9, VCPU_SHARED_MSR, r11)
+	PPC_LL	r0, VCPU_GPR(R0)(r4)
+	PPC_LL	r1, VCPU_GPR(R1)(r4)
+	PPC_LL	r2, VCPU_GPR(R2)(r4)
+	PPC_LL	r10, VCPU_GPR(R10)(r4)
+	PPC_LL	r11, VCPU_GPR(R11)(r4)
+	PPC_LL	r12, VCPU_GPR(R12)(r4)
+	PPC_LL	r13, VCPU_GPR(R13)(r4)
+	mtlr	r3
+	mtxer	r5
+	mtctr	r6
+	mtsrr0	r8
+	mtsrr1	r9
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r9, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	stw	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	bne	1b
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/*
+	 * Don't execute any instruction which can change CR after
+	 * below instruction.
+	 */
+	mtcr	r7
+
+	/* Finish loading guest volatiles and jump to guest. */
+	PPC_LL	r5, VCPU_GPR(R5)(r4)
+	PPC_LL	r6, VCPU_GPR(R6)(r4)
+	PPC_LL	r7, VCPU_GPR(R7)(r4)
+	PPC_LL	r8, VCPU_GPR(R8)(r4)
+	PPC_LL	r9, VCPU_GPR(R9)(r4)
+
+	PPC_LL	r3, VCPU_GPR(R3)(r4)
+	PPC_LL	r4, VCPU_GPR(R4)(r4)
+	rfi
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
new file mode 100644
index 0000000000..b0f6954287
--- /dev/null
+++ b/arch/powerpc/kvm/e500.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+
+#include "../mm/mmu_decl.h"
+#include "booke.h"
+#include "e500.h"
+
+struct id {
+	unsigned long val;
+	struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provide mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS	[0..1]
+ * guestTID	[0..255]
+ * guestPR	[0..1]
+ * ID		[1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+	struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provide reversed mappings of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+	struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps last used shadow ID on local core.
+ * The valid range of shadow ID is [1..255] */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
+/*
+ * Allocate a free shadow id and setup a valid sid mapping in given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+	unsigned long sid;
+	int ret = -1;
+
+	sid = __this_cpu_inc_return(pcpu_last_used_sid);
+	if (sid < NUM_TIDS) {
+		__this_cpu_write(pcpu_sids.entry[sid], entry);
+		entry->val = sid;
+		entry->pentry = this_cpu_ptr(&pcpu_sids.entry[sid]);
+		ret = sid;
+	}
+
+	/*
+	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+	 * the caller will invalidate everything and start over.
+	 *
+	 * sid > NUM_TIDS indicates a race, which we disable preemption to
+	 * avoid.
+	 */
+	WARN_ON(sid > NUM_TIDS);
+
+	return ret;
+}
+
+/*
+ * Check if given entry contain a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+	if (entry && entry->val != 0 &&
+	    __this_cpu_read(pcpu_sids.entry[entry->val]) == entry &&
+	    entry->pentry == this_cpu_ptr(&pcpu_sids.entry[entry->val]))
+		return entry->val;
+	return -1;
+}
+
+/* Invalidate all id mappings on local core -- call with preempt disabled */
+static inline void local_sid_destroy_all(void)
+{
+	__this_cpu_write(pcpu_last_used_sid, 0);
+	memset(this_cpu_ptr(&pcpu_sids), 0, sizeof(pcpu_sids));
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+	return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->idt);
+	vcpu_e500->idt = NULL;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to keep shadow of current guest non-zero PID,
+ * and use PID1 to keep shadow of guest zero PID.
+ * So that guest tlbe with TID=0 can be accessed at any time */
+static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	preempt_disable();
+	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu),
+			get_cur_pid(&vcpu_e500->vcpu),
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu), 0,
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	preempt_enable();
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+			       struct kvmppc_vcpu_e500 *vcpu_e500,
+			       int as, int pid, int pr)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+
+	BUG_ON(as >= 2);
+	BUG_ON(pid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	idt->id[as][pid][pr].val = 0;
+	idt->id[as][pid][pr].pentry = NULL;
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first lookup if a valid mapping exists,
+ * if not, then creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	int sid;
+
+	BUG_ON(as >= 2);
+	BUG_ON(gid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+	while (sid <= 0) {
+		/* No mapping yet */
+		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+		if (sid <= 0) {
+			_tlbil_all();
+			local_sid_destroy_all();
+		}
+
+		/* Update shadow pid when mappings are changed */
+		if (!avoid_recursion)
+			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+
+	return sid;
+}
+
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (vcpu->arch.pid != pid) {
+		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+}
+
+/* gtlbe must not be mapped by more than one host tlbe */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+                           struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	unsigned int pr, tid, ts;
+	int pid;
+	u32 val, eaddr;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	preempt_disable();
+
+	/* One guest ID may be mapped to two shadow IDs */
+	for (pr = 0; pr < 2; pr++) {
+		/*
+		 * The shadow PID can have a valid mapping on at most one
+		 * host CPU.  In the common case, it will be valid on this
+		 * CPU, in which case we do a local invalidation of the
+		 * specific address.
+		 *
+		 * If the shadow PID is not valid on the current host CPU,
+		 * we invalidate the entire shadow PID.
+		 */
+		pid = local_sid_lookup(&idt->id[ts][tid][pr]);
+		if (pid <= 0) {
+			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+			continue;
+		}
+
+		/*
+		 * The guest is invalidating a 4K entry which is in a PID
+		 * that has a valid shadow mapping on this host CPU.  We
+		 * search host TLB to invalidate it's shadow TLB entry,
+		 * similar to __tlbil_va except that we need to look in AS1.
+		 */
+		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+		eaddr = get_tlb_eaddr(gtlbe);
+
+		local_irq_save(flags);
+
+		mtspr(SPRN_MAS6, val);
+		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+		val = mfspr(SPRN_MAS1);
+		if (val & MAS1_VALID) {
+			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+			asm volatile("tlbwe");
+		}
+
+		local_irq_restore(flags);
+	}
+
+	preempt_enable();
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+	/* Recalc shadow pid since MSR changes */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
+static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu)
+{
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	/* Shadow PID may be expired on local core */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
+static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SPE
+	if (vcpu->arch.shadow_msr & MSR_SPE)
+		kvmppc_vcpu_disable_spe(vcpu);
+#endif
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+static int kvmppc_e500_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500v2") == 0)
+		r = 0;
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+
+	/* Insert large initial mapping for guest. */
+	tlbe = get_entry(vcpu_e500, 1, 0);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
+	tlbe->mas2 = 0;
+	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
+
+	/* 4K map for serial output. Used by kernel wrapper. */
+	tlbe = get_entry(vcpu_e500, 1, 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_e500_tlb_setup(vcpu_e500);
+
+	/* Registers init */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500V2;
+
+	return 0;
+}
+
+static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE |
+	                       KVM_SREGS_E_PM;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+	sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+	sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+
+	kvmppc_get_sregs_ivor(vcpu, sregs);
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+	return 0;
+}
+
+static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_SPE) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] =
+			sregs->u.e.ivor_high[0];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] =
+			sregs->u.e.ivor_high[1];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] =
+			sregs->u.e.ivor_high[2];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+static int kvmppc_core_vcpu_create_e500(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	int err;
+
+	BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0);
+	vcpu_e500 = to_e500(vcpu);
+
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+		return -ENOMEM;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_id;
+
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared) {
+		err = -ENOMEM;
+		goto uninit_tlb;
+	}
+
+	return 0;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_id:
+	kvmppc_e500_id_table_free(vcpu_e500);
+	return err;
+}
+
+static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvmppc_e500_id_table_free(vcpu_e500);
+}
+
+static int kvmppc_core_init_vm_e500(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_e500(struct kvm *kvm)
+{
+}
+
+static struct kvmppc_ops kvm_ops_e500 = {
+	.get_sregs = kvmppc_core_get_sregs_e500,
+	.set_sregs = kvmppc_core_set_sregs_e500,
+	.get_one_reg = kvmppc_get_one_reg_e500,
+	.set_one_reg = kvmppc_set_one_reg_e500,
+	.vcpu_load   = kvmppc_core_vcpu_load_e500,
+	.vcpu_put    = kvmppc_core_vcpu_put_e500,
+	.vcpu_create = kvmppc_core_vcpu_create_e500,
+	.vcpu_free   = kvmppc_core_vcpu_free_e500,
+	.init_vm = kvmppc_core_init_vm_e500,
+	.destroy_vm = kvmppc_core_destroy_vm_e500,
+	.emulate_op = kvmppc_core_emulate_op_e500,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+	.create_vcpu_debugfs = kvmppc_create_vcpu_debugfs_e500,
+};
+
+static int __init kvmppc_e500_init(void)
+{
+	int r, i;
+	unsigned long ivor[3];
+	/* Process remaining handlers above the generic first 16 */
+	unsigned long *handler = &kvmppc_booke_handler_addr[16];
+	unsigned long handler_len;
+	unsigned long max_ivor = 0;
+
+	r = kvmppc_e500_check_processor_compat();
+	if (r)
+		goto err_out;
+
+	r = kvmppc_booke_init();
+	if (r)
+		goto err_out;
+
+	/* copy extra E500 exception handlers */
+	ivor[0] = mfspr(SPRN_IVOR32);
+	ivor[1] = mfspr(SPRN_IVOR33);
+	ivor[2] = mfspr(SPRN_IVOR34);
+	for (i = 0; i < 3; i++) {
+		if (ivor[i] > ivor[max_ivor])
+			max_ivor = i;
+
+		handler_len = handler[i + 1] - handler[i];
+		memcpy((void *)kvmppc_booke_handlers + ivor[i],
+		       (void *)handler[i], handler_len);
+	}
+	handler_len = handler[max_ivor + 1] - handler[max_ivor];
+	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+			   ivor[max_ivor] + handler_len);
+
+	r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_e500.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500;
+
+err_out:
+	return r;
+}
+
+static void __exit kvmppc_e500_exit(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500_init);
+module_exit(kvmppc_e500_exit);
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
new file mode 100644
index 0000000000..6d0d329cbb
--- /dev/null
+++ b/arch/powerpc/kvm/e500.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu <yu.liu@freescale.com>
+ *         Scott Wood <scottwood@freescale.com>
+ *         Ashish Kalra <ashish.kalra@freescale.com>
+ *         Varun Sethi <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.h and
+ * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>,
+ * Copyright IBM Corp. 2007-2008
+ */
+
+#ifndef KVM_E500_H
+#define KVM_E500_H
+
+#include <linux/kvm_host.h>
+#include <asm/nohash/mmu-e500.h>
+#include <asm/tlb.h>
+#include <asm/cputhreads.h>
+
+enum vcpu_ftr {
+	VCPU_FTR_MMU_V2
+};
+
+#define E500_PID_NUM   3
+#define E500_TLB_NUM   2
+
+/* entry is mapped somewhere in host TLB */
+#define E500_TLB_VALID		(1 << 31)
+/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */
+#define E500_TLB_BITMAP		(1 << 30)
+/* TLB1 entry is mapped by host TLB0 */
+#define E500_TLB_TLB0		(1 << 29)
+/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */
+#define E500_TLB_MAS2_ATTR	(0x7f)
+
+struct tlbe_ref {
+	kvm_pfn_t pfn;		/* valid only for TLB0, except briefly */
+	unsigned int flags;	/* E500_TLB_* */
+};
+
+struct tlbe_priv {
+	struct tlbe_ref ref;
+};
+
+#ifdef CONFIG_KVM_E500V2
+struct vcpu_id_table;
+#endif
+
+struct kvmppc_e500_tlb_params {
+	int entries, ways, sets;
+};
+
+struct kvmppc_vcpu_e500 {
+	struct kvm_vcpu vcpu;
+
+	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
+	struct kvm_book3e_206_tlb_entry *gtlb_arch;
+
+	/* Starting entry number in gtlb_arch[] */
+	int gtlb_offset[E500_TLB_NUM];
+
+	/* KVM internal information associated with each guest TLB entry */
+	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
+
+	unsigned int gtlb_nv[E500_TLB_NUM];
+
+	unsigned int host_tlb1_nv;
+
+	u32 svr;
+	u32 l1csr0;
+	u32 l1csr1;
+	u32 hid0;
+	u32 hid1;
+	u64 mcar;
+
+	struct page **shared_tlb_pages;
+	int num_shared_tlb_pages;
+
+	u64 *g2h_tlb1_map;
+	unsigned int *h2g_tlb1_rmap;
+
+	/* Minimum and maximum address mapped my TLB1 */
+	unsigned long tlb1_min_eaddr;
+	unsigned long tlb1_max_eaddr;
+
+#ifdef CONFIG_KVM_E500V2
+	u32 pid[E500_PID_NUM];
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+#endif
+};
+
+static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
+}
+
+
+/* This geometry is the legacy default -- can be overridden by userspace */
+#define KVM_E500_TLB0_WAY_SIZE		128
+#define KVM_E500_TLB0_WAY_NUM		2
+
+#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
+#define KVM_E500_TLB1_SIZE  16
+
+#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF))
+#define tlbsel_of(index)	((index) >> 16)
+#define esel_of(index)		((index) & 0xFFFF)
+
+#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
+#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
+#define MAS2_ATTRIB_MASK \
+	  (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G)
+#define MAS3_ATTRIB_MASK \
+	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
+	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
+
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
+				ulong value);
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val);
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val);
+
+#ifdef CONFIG_KVM_E500V2
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion);
+#endif
+
+/* TLB helper functions */
+static inline unsigned int
+get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 7) & 0x1f;
+}
+
+static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas2 & MAS2_EPN;
+}
+
+static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	unsigned int pgsize = get_tlb_size(tlbe);
+	return 1ULL << 10 << pgsize;
+}
+
+static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	u64 bytes = get_tlb_bytes(tlbe);
+	return get_tlb_eaddr(tlbe) + bytes - 1;
+}
+
+static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas7_3 & ~0xfffULL;
+}
+
+static inline unsigned int
+get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 16) & 0xff;
+}
+
+static inline unsigned int
+get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 12) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 31) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 30) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
+}
+
+static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pid & 0xff;
+}
+
+static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
+}
+
+static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & MSR_PR);
+}
+
+static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas6 >> 16) & 0xff;
+}
+
+static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas6 & 0x1;
+}
+
+static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Manual says that tlbsel has 2 bits wide.
+	 * Since we only have two TLBs, only lower bit is used.
+	 */
+	return (vcpu->arch.shared->mas0 >> 28) & 0x1;
+}
+
+static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas0 & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
+}
+
+static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+			const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	gpa_t gpa;
+
+	if (!get_tlb_v(tlbe))
+		return 0;
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	/* Does it match current guest AS? */
+	/* XXX what about IS != DS? */
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
+		return 0;
+#endif
+
+	gpa = get_tlb_raddr(tlbe);
+	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+		/* Mapping is not for RAM. */
+		return 0;
+
+	return 1;
+}
+
+static inline struct kvm_book3e_206_tlb_entry *get_entry(
+	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
+{
+	int offset = vcpu_e500->gtlb_offset[tlbsel];
+	return &vcpu_e500->gtlb_arch[offset + entry];
+}
+
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe);
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe)       get_tlb_tid(gtlbe)
+#define get_tlbmiss_tid(vcpu)           get_cur_pid(vcpu)
+#define get_tlb_sts(gtlbe)              (gtlbe->mas1 & MAS1_TS)
+
+/*
+ * These functions should be called with preemption disabled
+ * and the returned value is valid only in that context
+ */
+static inline int get_thread_specific_lpid(int vm_lpid)
+{
+	int vcpu_lpid = vm_lpid;
+
+	if (threads_per_core == 2)
+		vcpu_lpid |= smp_processor_id() & 1;
+
+	return vcpu_lpid;
+}
+
+static inline int get_lpid(struct kvm_vcpu *vcpu)
+{
+	return get_thread_specific_lpid(vcpu->kvm->arch.lpid);
+}
+#else
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe);
+
+static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf;
+
+	return vcpu_e500->pid[tidseld];
+}
+
+/* Force TS=1 for all guest mappings. */
+#define get_tlb_sts(gtlbe)              (MAS1_TS)
+#endif /* !BOOKE_HV */
+
+static inline bool has_feature(const struct kvm_vcpu *vcpu,
+			       enum vcpu_ftr ftr)
+{
+	bool has_ftr;
+	switch (ftr) {
+	case VCPU_FTR_MMU_V2:
+		has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
+		break;
+	default:
+		return false;
+	}
+	return has_ftr;
+}
+
+#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
new file mode 100644
index 0000000000..051102d50c
--- /dev/null
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x_emulate.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/dbell.h>
+#include <asm/reg_booke.h>
+
+#include "booke.h"
+#include "e500.h"
+
+#define XOP_DCBTLS  166
+#define XOP_MSGSND  206
+#define XOP_MSGCLR  238
+#define XOP_MFTMR   366
+#define XOP_TLBIVAX 786
+#define XOP_TLBSX   914
+#define XOP_TLBRE   946
+#define XOP_TLBWE   978
+#define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+
+#ifdef CONFIG_KVM_E500MC
+static int dbell2prio(ulong param)
+{
+	int msg = param & PPC_DBELL_TYPE_MASK;
+	int prio = -1;
+
+	switch (msg) {
+	case PPC_DBELL_TYPE(PPC_DBELL):
+		prio = BOOKE_IRQPRIO_DBELL;
+		break;
+	case PPC_DBELL_TYPE(PPC_DBELL_CRIT):
+		prio = BOOKE_IRQPRIO_DBELL_CRIT;
+		break;
+	default:
+		break;
+	}
+
+	return prio;
+}
+
+static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.regs.gpr[rb];
+	int prio = dbell2prio(param);
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	clear_bit(prio, &vcpu->arch.pending_exceptions);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.regs.gpr[rb];
+	int prio = dbell2prio(rb);
+	int pir = param & PPC_DBELL_PIR_MASK;
+	unsigned long i;
+	struct kvm_vcpu *cvcpu;
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) {
+		int cpir = cvcpu->arch.shared->pir;
+		if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) {
+			set_bit(prio, &cvcpu->arch.pending_exceptions);
+			kvm_vcpu_kick(cvcpu);
+		}
+	}
+
+	return EMULATE_DONE;
+}
+#endif
+
+static int kvmppc_e500_emul_ehpriv(struct kvm_vcpu *vcpu,
+				   unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (get_oc(inst)) {
+	case EHPRIV_OC_DEBUG:
+		vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+		vcpu->run->debug.arch.address = vcpu->arch.regs.nip;
+		vcpu->run->debug.arch.status = 0;
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
+		emulated = EMULATE_EXIT_USER;
+		*advance = 0;
+		break;
+	default:
+		emulated = EMULATE_FAIL;
+	}
+	return emulated;
+}
+
+static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	/* Always fail to lock the cache */
+	vcpu_e500->l1csr0 |= L1CSR0_CUL;
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst,
+				  int rt)
+{
+	/* Expose one thread per vcpu */
+	if (get_tmrn(inst) == TMRN_TMCFG0) {
+		kvmppc_set_gpr(vcpu, rt,
+			       1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT));
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_op_e500(struct kvm_vcpu *vcpu,
+				unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	int rt = get_rt(inst);
+	gva_t ea;
+
+	switch (get_op(inst)) {
+	case 31:
+		switch (get_xop(inst)) {
+
+		case XOP_DCBTLS:
+			emulated = kvmppc_e500_emul_dcbtls(vcpu);
+			break;
+
+#ifdef CONFIG_KVM_E500MC
+		case XOP_MSGSND:
+			emulated = kvmppc_e500_emul_msgsnd(vcpu, rb);
+			break;
+
+		case XOP_MSGCLR:
+			emulated = kvmppc_e500_emul_msgclr(vcpu, rb);
+			break;
+#endif
+
+		case XOP_TLBRE:
+			emulated = kvmppc_e500_emul_tlbre(vcpu);
+			break;
+
+		case XOP_TLBWE:
+			emulated = kvmppc_e500_emul_tlbwe(vcpu);
+			break;
+
+		case XOP_TLBSX:
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
+			break;
+
+		case XOP_TLBILX: {
+			int type = rt & 0x3;
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
+			break;
+		}
+
+		case XOP_TLBIVAX:
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
+			break;
+
+		case XOP_MFTMR:
+			emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt);
+			break;
+
+		case XOP_EHPRIV:
+			emulated = kvmppc_e500_emul_ehpriv(vcpu, inst, advance);
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+		}
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL)
+		emulated = kvmppc_booke_emulate_op(vcpu, inst, advance);
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+	case SPRN_PID:
+		kvmppc_set_pid(vcpu, spr_val);
+		break;
+	case SPRN_PID1:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
+		vcpu_e500->pid[1] = spr_val;
+		break;
+	case SPRN_PID2:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
+		vcpu_e500->pid[2] = spr_val;
+		break;
+	case SPRN_MAS0:
+		vcpu->arch.shared->mas0 = spr_val;
+		break;
+	case SPRN_MAS1:
+		vcpu->arch.shared->mas1 = spr_val;
+		break;
+	case SPRN_MAS2:
+		vcpu->arch.shared->mas2 = spr_val;
+		break;
+	case SPRN_MAS3:
+		vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= spr_val;
+		break;
+	case SPRN_MAS4:
+		vcpu->arch.shared->mas4 = spr_val;
+		break;
+	case SPRN_MAS6:
+		vcpu->arch.shared->mas6 = spr_val;
+		break;
+	case SPRN_MAS7:
+		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
+		break;
+#endif
+	case SPRN_L1CSR0:
+		vcpu_e500->l1csr0 = spr_val;
+		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
+		break;
+	case SPRN_L1CSR1:
+		vcpu_e500->l1csr1 = spr_val;
+		vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR);
+		break;
+	case SPRN_HID0:
+		vcpu_e500->hid0 = spr_val;
+		break;
+	case SPRN_HID1:
+		vcpu_e500->hid1 = spr_val;
+		break;
+
+	case SPRN_MMUCSR0:
+		emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
+				spr_val);
+		break;
+
+	case SPRN_PWRMGTCR0:
+		/*
+		 * Guest relies on host power management configurations
+		 * Treat the request as a general store
+		 */
+		vcpu->arch.pwrmgtcr0 = spr_val;
+		break;
+
+	case SPRN_BUCSR:
+		/*
+		 * If we are here, it means that we have already flushed the
+		 * branch predictor, so just return to guest.
+		 */
+		break;
+
+	/* extra exceptions */
+#ifdef CONFIG_SPE_POSSIBLE
+	case SPRN_IVOR32:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR33:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val;
+		break;
+	case SPRN_IVOR34:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val;
+		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case SPRN_IVOR32:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR33:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST] = spr_val;
+		break;
+#endif
+	case SPRN_IVOR35:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
+		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val;
+		break;
+	case SPRN_IVOR37:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val;
+		break;
+#endif
+	default:
+		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);
+	}
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+	case SPRN_PID:
+		*spr_val = vcpu_e500->pid[0];
+		break;
+	case SPRN_PID1:
+		*spr_val = vcpu_e500->pid[1];
+		break;
+	case SPRN_PID2:
+		*spr_val = vcpu_e500->pid[2];
+		break;
+	case SPRN_MAS0:
+		*spr_val = vcpu->arch.shared->mas0;
+		break;
+	case SPRN_MAS1:
+		*spr_val = vcpu->arch.shared->mas1;
+		break;
+	case SPRN_MAS2:
+		*spr_val = vcpu->arch.shared->mas2;
+		break;
+	case SPRN_MAS3:
+		*spr_val = (u32)vcpu->arch.shared->mas7_3;
+		break;
+	case SPRN_MAS4:
+		*spr_val = vcpu->arch.shared->mas4;
+		break;
+	case SPRN_MAS6:
+		*spr_val = vcpu->arch.shared->mas6;
+		break;
+	case SPRN_MAS7:
+		*spr_val = vcpu->arch.shared->mas7_3 >> 32;
+		break;
+#endif
+	case SPRN_DECAR:
+		*spr_val = vcpu->arch.decar;
+		break;
+	case SPRN_TLB0CFG:
+		*spr_val = vcpu->arch.tlbcfg[0];
+		break;
+	case SPRN_TLB1CFG:
+		*spr_val = vcpu->arch.tlbcfg[1];
+		break;
+	case SPRN_TLB0PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[0];
+		break;
+	case SPRN_TLB1PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[1];
+		break;
+	case SPRN_L1CSR0:
+		*spr_val = vcpu_e500->l1csr0;
+		break;
+	case SPRN_L1CSR1:
+		*spr_val = vcpu_e500->l1csr1;
+		break;
+	case SPRN_HID0:
+		*spr_val = vcpu_e500->hid0;
+		break;
+	case SPRN_HID1:
+		*spr_val = vcpu_e500->hid1;
+		break;
+	case SPRN_SVR:
+		*spr_val = vcpu_e500->svr;
+		break;
+
+	case SPRN_MMUCSR0:
+		*spr_val = 0;
+		break;
+
+	case SPRN_MMUCFG:
+		*spr_val = vcpu->arch.mmucfg;
+		break;
+	case SPRN_EPTCFG:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		/*
+		 * Legacy Linux guests access EPTCFG register even if the E.PT
+		 * category is disabled in the VM. Give them a chance to live.
+		 */
+		*spr_val = vcpu->arch.eptcfg;
+		break;
+
+	case SPRN_PWRMGTCR0:
+		*spr_val = vcpu->arch.pwrmgtcr0;
+		break;
+
+	/* extra exceptions */
+#ifdef CONFIG_SPE_POSSIBLE
+	case SPRN_IVOR32:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+		break;
+	case SPRN_IVOR33:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+		break;
+	case SPRN_IVOR34:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case SPRN_IVOR32:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL];
+		break;
+	case SPRN_IVOR33:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST];
+		break;
+#endif
+	case SPRN_IVOR35:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+		break;
+	case SPRN_IVOR37:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+		break;
+#endif
+	default:
+		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);
+	}
+
+	return emulated;
+}
+
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
new file mode 100644
index 0000000000..e131fbecdc
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -0,0 +1,956 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
+ *         Alexander Graf, agraf@suse.de
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <asm/kvm_ppc.h>
+
+#include "e500.h"
+#include "trace_booke.h"
+#include "timing.h"
+#include "e500_mmu_host.h"
+
+static inline unsigned int gtlb0_get_next_victim(
+		struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned int victim;
+
+	victim = vcpu_e500->gtlb_nv[0]++;
+	if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways))
+		vcpu_e500->gtlb_nv[0] = 0;
+
+	return victim;
+}
+
+static int tlb0_set_base(gva_t addr, int sets, int ways)
+{
+	int set_base;
+
+	set_base = (addr >> PAGE_SHIFT) & (sets - 1);
+	set_base *= ways;
+
+	return set_base;
+}
+
+static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr)
+{
+	return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets,
+			     vcpu_e500->gtlb_params[0].ways);
+}
+
+static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel = get_tlb_esel_bit(vcpu);
+
+	if (tlbsel == 0) {
+		esel &= vcpu_e500->gtlb_params[0].ways - 1;
+		esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2);
+	} else {
+		esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1;
+	}
+
+	return esel;
+}
+
+/* Search the guest TLB for a matching entry. */
+static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
+		gva_t eaddr, int tlbsel, unsigned int pid, int as)
+{
+	int size = vcpu_e500->gtlb_params[tlbsel].entries;
+	unsigned int set_base, offset;
+	int i;
+
+	if (tlbsel == 0) {
+		set_base = gtlb0_set_base(vcpu_e500, eaddr);
+		size = vcpu_e500->gtlb_params[0].ways;
+	} else {
+		if (eaddr < vcpu_e500->tlb1_min_eaddr ||
+				eaddr > vcpu_e500->tlb1_max_eaddr)
+			return -1;
+		set_base = 0;
+	}
+
+	offset = vcpu_e500->gtlb_offset[tlbsel];
+
+	for (i = 0; i < size; i++) {
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + set_base + i];
+		unsigned int tid;
+
+		if (eaddr < get_tlb_eaddr(tlbe))
+			continue;
+
+		if (eaddr > get_tlb_end(tlbe))
+			continue;
+
+		tid = get_tlb_tid(tlbe);
+		if (tid && (tid != pid))
+			continue;
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		if (get_tlb_ts(tlbe) != as && as != -1)
+			continue;
+
+		return set_base + i;
+	}
+
+	return -1;
+}
+
+static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
+		gva_t eaddr, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int victim, tsized;
+	int tlbsel;
+
+	/* since we only have two TLBs, only lower bit is used. */
+	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
+	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
+	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
+
+	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
+		| MAS1_TID(get_tlbmiss_tid(vcpu))
+		| MAS1_TSIZE(tsized);
+	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
+		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
+	vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+	vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1)
+		| (get_cur_pid(vcpu) << 16)
+		| (as ? MAS6_SAS : 0);
+}
+
+static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int size = vcpu_e500->gtlb_params[1].entries;
+	unsigned int offset;
+	gva_t eaddr;
+	int i;
+
+	vcpu_e500->tlb1_min_eaddr = ~0UL;
+	vcpu_e500->tlb1_max_eaddr = 0;
+	offset = vcpu_e500->gtlb_offset[1];
+
+	for (i = 0; i < size; i++) {
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + i];
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		eaddr = get_tlb_eaddr(tlbe);
+		vcpu_e500->tlb1_min_eaddr =
+				min(vcpu_e500->tlb1_min_eaddr, eaddr);
+
+		eaddr = get_tlb_end(tlbe);
+		vcpu_e500->tlb1_max_eaddr =
+				max(vcpu_e500->tlb1_max_eaddr, eaddr);
+	}
+}
+
+static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	return vcpu_e500->tlb1_min_eaddr == start ||
+			vcpu_e500->tlb1_max_eaddr == end;
+}
+
+/* This function is supposed to be called for a adding a new valid tlb entry */
+static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (!get_tlb_v(gtlbe))
+		return;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start);
+	vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end);
+}
+
+static inline int kvmppc_e500_gtlbe_invalidate(
+				struct kvmppc_vcpu_e500 *vcpu_e500,
+				int tlbsel, int esel)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
+
+	if (unlikely(get_tlb_iprot(gtlbe)))
+		return -1;
+
+	if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+		kvmppc_recalc_tlb1map_range(vcpu_e500);
+
+	gtlbe->mas1 = 0;
+
+	return 0;
+}
+
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
+{
+	int esel;
+
+	if (value & MMUCSR0_TLB0FI)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
+	if (value & MMUCSR0_TLB1FI)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
+
+	/* Invalidate all host shadow mappings */
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int ia;
+	int esel, tlbsel;
+
+	ia = (ea >> 2) & 0x1;
+
+	/* since we only have two TLBs, only lower bit is used. */
+	tlbsel = (ea >> 3) & 0x1;
+
+	if (ia) {
+		/* invalidate all entries */
+		for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries;
+		     esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+	} else {
+		ea &= 0xfffff000;
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel,
+				get_cur_pid(vcpu), -1);
+		if (esel >= 0)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+	}
+
+	/* Invalidate all host shadow mappings */
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+
+	return EMULATE_DONE;
+}
+
+static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+		       int pid, int type)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+	int tid, esel;
+
+	/* invalidate all entries */
+	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
+		tlbe = get_entry(vcpu_e500, tlbsel, esel);
+		tid = get_tlb_tid(tlbe);
+		if (type == 0 || tid == pid) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+		}
+	}
+}
+
+static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
+		       gva_t ea)
+{
+	int tlbsel, esel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
+		if (esel >= 0) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+}
+
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int pid = get_cur_spid(vcpu);
+
+	if (type == 0 || type == 1) {
+		tlbilx_all(vcpu_e500, 0, pid, type);
+		tlbilx_all(vcpu_e500, 1, pid, type);
+	} else if (type == 3) {
+		tlbilx_one(vcpu_e500, pid, ea);
+	}
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int tlbsel, esel;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+	vcpu->arch.shared->mas0 &= ~MAS0_NV(~0);
+	vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+	vcpu->arch.shared->mas1 = gtlbe->mas1;
+	vcpu->arch.shared->mas2 = gtlbe->mas2;
+	vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int as = !!get_cur_sas(vcpu);
+	unsigned int pid = get_cur_spid(vcpu);
+	int esel, tlbsel;
+	struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
+		if (esel >= 0) {
+			gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+
+	if (gtlbe) {
+		esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1;
+
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+		vcpu->arch.shared->mas1 = gtlbe->mas1;
+		vcpu->arch.shared->mas2 = gtlbe->mas2;
+		vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
+	} else {
+		int victim;
+
+		/* since we only have two TLBs, only lower bit is used. */
+		tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1;
+		victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
+
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel)
+			| MAS0_ESEL(victim)
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+		vcpu->arch.shared->mas1 =
+			  (vcpu->arch.shared->mas6 & MAS6_SPID0)
+			| ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0)
+			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
+		vcpu->arch.shared->mas2 &= MAS2_EPN;
+		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
+					   MAS2_ATTRIB_MASK;
+		vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 |
+					     MAS3_U2 | MAS3_U3;
+	}
+
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	int tlbsel, esel;
+	int recal = 0;
+	int idx;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+
+	if (get_tlb_v(gtlbe)) {
+		inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+		if ((tlbsel == 1) &&
+			kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+			recal = 1;
+	}
+
+	gtlbe->mas1 = vcpu->arch.shared->mas1;
+	gtlbe->mas2 = vcpu->arch.shared->mas2;
+	if (!(vcpu->arch.shared->msr & MSR_CM))
+		gtlbe->mas2 &= 0xffffffffUL;
+	gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
+
+	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
+	                              gtlbe->mas2, gtlbe->mas7_3);
+
+	if (tlbsel == 1) {
+		/*
+		 * If a valid tlb1 entry is overwritten then recalculate the
+		 * min/max TLB1 map address range otherwise no need to look
+		 * in tlb1 array.
+		 */
+		if (recal)
+			kvmppc_recalc_tlb1map_range(vcpu_e500);
+		else
+			kvmppc_set_tlb1map_range(vcpu, gtlbe);
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+	if (tlbe_is_host_safe(vcpu, gtlbe)) {
+		u64 eaddr = get_tlb_eaddr(gtlbe);
+		u64 raddr = get_tlb_raddr(gtlbe);
+
+		if (tlbsel == 0) {
+			gtlbe->mas1 &= ~MAS1_TSIZE(~0);
+			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+		}
+
+		/* Premap the faulting page */
+		kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel));
+	}
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
+				  gva_t eaddr, unsigned int pid, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel, tlbsel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
+		if (esel >= 0)
+			return index_of(tlbsel, esel);
+	}
+
+	return -1;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+	int index;
+	gva_t eaddr;
+	u8 pid;
+	u8 as;
+
+	eaddr = tr->linear_address;
+	pid = (tr->linear_address >> 32) & 0xff;
+	as = (tr->linear_address >> 40) & 0x1;
+
+	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
+	if (index < 0) {
+		tr->valid = 0;
+		return 0;
+	}
+
+	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
+	/* XXX what does "writeable" and "usermode" even mean? */
+	tr->valid = 1;
+
+	return 0;
+}
+
+
+int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
+
+	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
+
+	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
+
+	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.regs.nip, as);
+}
+
+void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
+
+	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
+}
+
+gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
+			gva_t eaddr)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	u64 pgmask;
+
+	gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index));
+	pgmask = get_tlb_bytes(gtlbe) - 1;
+
+	return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
+}
+
+/*****************************************/
+
+static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int i;
+
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+	kfree(vcpu_e500->g2h_tlb1_map);
+	kfree(vcpu_e500->gtlb_priv[0]);
+	kfree(vcpu_e500->gtlb_priv[1]);
+
+	if (vcpu_e500->shared_tlb_pages) {
+		vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch,
+					  PAGE_SIZE)));
+
+		for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) {
+			set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]);
+			put_page(vcpu_e500->shared_tlb_pages[i]);
+		}
+
+		vcpu_e500->num_shared_tlb_pages = 0;
+
+		kfree(vcpu_e500->shared_tlb_pages);
+		vcpu_e500->shared_tlb_pages = NULL;
+	} else {
+		kfree(vcpu_e500->gtlb_arch);
+	}
+
+	vcpu_e500->gtlb_arch = NULL;
+}
+
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
+	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
+	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
+	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
+	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
+	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
+
+	sregs->u.e.mmucfg = vcpu->arch.mmucfg;
+	sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0];
+	sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1];
+	sregs->u.e.tlbcfg[2] = 0;
+	sregs->u.e.tlbcfg[3] = 0;
+}
+
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
+		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
+		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
+		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
+		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
+		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
+		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
+	}
+
+	return 0;
+}
+
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		*val = get_reg_val(id, vcpu->arch.shared->mas0);
+		break;
+	case KVM_REG_PPC_MAS1:
+		*val = get_reg_val(id, vcpu->arch.shared->mas1);
+		break;
+	case KVM_REG_PPC_MAS2:
+		*val = get_reg_val(id, vcpu->arch.shared->mas2);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		*val = get_reg_val(id, vcpu->arch.shared->mas7_3);
+		break;
+	case KVM_REG_PPC_MAS4:
+		*val = get_reg_val(id, vcpu->arch.shared->mas4);
+		break;
+	case KVM_REG_PPC_MAS6:
+		*val = get_reg_val(id, vcpu->arch.shared->mas6);
+		break;
+	case KVM_REG_PPC_MMUCFG:
+		*val = get_reg_val(id, vcpu->arch.mmucfg);
+		break;
+	case KVM_REG_PPC_EPTCFG:
+		*val = get_reg_val(id, vcpu->arch.eptcfg);
+		break;
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG:
+		i = id - KVM_REG_PPC_TLB0CFG;
+		*val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
+		break;
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS:
+		i = id - KVM_REG_PPC_TLB0PS;
+		*val = get_reg_val(id, vcpu->arch.tlbps[i]);
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		vcpu->arch.shared->mas0 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS1:
+		vcpu->arch.shared->mas1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS2:
+		vcpu->arch.shared->mas2 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS4:
+		vcpu->arch.shared->mas4 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS6:
+		vcpu->arch.shared->mas6 = set_reg_val(id, *val);
+		break;
+	/* Only allow MMU registers to be set to the config supported by KVM */
+	case KVM_REG_PPC_MMUCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.mmucfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_EPTCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.eptcfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG: {
+		/* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0CFG;
+		if (reg != vcpu->arch.tlbcfg[i])
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS: {
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0PS;
+		if (reg != vcpu->arch.tlbps[i])
+			r = -EINVAL;
+		break;
+	}
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
+		struct kvm_book3e_206_tlb_params *params)
+{
+	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	if (params->tlb_sizes[0] <= 2048)
+		vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
+	vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
+	vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	return 0;
+}
+
+int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
+			      struct kvm_config_tlb *cfg)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_params params;
+	char *virt;
+	struct page **pages;
+	struct tlbe_priv *privs[2] = {};
+	u64 *g2h_bitmap;
+	size_t array_len;
+	u32 sets;
+	int num_pages, ret, i;
+
+	if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV)
+		return -EINVAL;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params,
+			   sizeof(params)))
+		return -EFAULT;
+
+	if (params.tlb_sizes[1] > 64)
+		return -EINVAL;
+	if (params.tlb_ways[1] != params.tlb_sizes[1])
+		return -EINVAL;
+	if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0)
+		return -EINVAL;
+	if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0)
+		return -EINVAL;
+
+	if (!is_power_of_2(params.tlb_ways[0]))
+		return -EINVAL;
+
+	sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]);
+	if (!is_power_of_2(sets))
+		return -EINVAL;
+
+	array_len = params.tlb_sizes[0] + params.tlb_sizes[1];
+	array_len *= sizeof(struct kvm_book3e_206_tlb_entry);
+
+	if (cfg->array_len < array_len)
+		return -EINVAL;
+
+	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
+		    cfg->array / PAGE_SIZE;
+	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
+	if (ret < 0)
+		goto free_pages;
+
+	if (ret != num_pages) {
+		num_pages = ret;
+		ret = -EFAULT;
+		goto put_pages;
+	}
+
+	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
+	if (!virt) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
+
+	privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+	if (!privs[0]) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
+
+	privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+	if (!privs[1]) {
+		ret = -ENOMEM;
+		goto free_privs_first;
+	}
+
+	g2h_bitmap = kcalloc(params.tlb_sizes[1],
+			     sizeof(*g2h_bitmap),
+			     GFP_KERNEL);
+	if (!g2h_bitmap) {
+		ret = -ENOMEM;
+		goto free_privs_second;
+	}
+
+	free_gtlb(vcpu_e500);
+
+	vcpu_e500->gtlb_priv[0] = privs[0];
+	vcpu_e500->gtlb_priv[1] = privs[1];
+	vcpu_e500->g2h_tlb1_map = g2h_bitmap;
+
+	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
+		(virt + (cfg->array & (PAGE_SIZE - 1)));
+
+	vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0];
+	vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1];
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
+
+	/* Update vcpu's MMU geometry based on SW_TLB input */
+	vcpu_mmu_geometry_update(vcpu, &params);
+
+	vcpu_e500->shared_tlb_pages = pages;
+	vcpu_e500->num_shared_tlb_pages = num_pages;
+
+	vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0];
+	vcpu_e500->gtlb_params[0].sets = sets;
+
+	vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1];
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	return 0;
+ free_privs_second:
+	kfree(privs[1]);
+ free_privs_first:
+	kfree(privs[0]);
+ put_pages:
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+ free_pages:
+	kfree(pages);
+	return ret;
+}
+
+int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
+			     struct kvm_dirty_tlb *dirty)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	kvmppc_core_flush_tlb(vcpu);
+	return 0;
+}
+
+/* Vcpu's MMU default configuration */
+static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
+		       struct kvmppc_e500_tlb_params *params)
+{
+	/* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/
+	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+	/* Initialize TLBnCFG fields with host values and SW_TLB geometry*/
+	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[0] |= params[0].entries;
+	vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params[1].entries;
+	vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
+
+	if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
+		vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
+		vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
+
+		vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
+
+		/* Guest mmu emulation currently doesn't handle E.PT */
+		vcpu->arch.eptcfg = 0;
+		vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
+		vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
+	}
+
+	return 0;
+}
+
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
+
+	if (e500_mmu_host_init(vcpu_e500))
+		goto free_vcpu;
+
+	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
+	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
+
+	vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM;
+	vcpu_e500->gtlb_params[0].sets =
+		KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM;
+
+	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+					     KVM_E500_TLB1_SIZE,
+					     sizeof(*vcpu_e500->gtlb_arch),
+					     GFP_KERNEL);
+	if (!vcpu_e500->gtlb_arch)
+		return -ENOMEM;
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
+
+	vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+					  sizeof(struct tlbe_ref),
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[0])
+		goto free_vcpu;
+
+	vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(struct tlbe_ref),
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[1])
+		goto free_vcpu;
+
+	vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(*vcpu_e500->g2h_tlb1_map),
+					  GFP_KERNEL);
+	if (!vcpu_e500->g2h_tlb1_map)
+		goto free_vcpu;
+
+	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
+
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	return 0;
+ free_vcpu:
+	free_gtlb(vcpu_e500);
+	return -1;
+}
+
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	free_gtlb(vcpu_e500);
+	e500_mmu_host_uninit(vcpu_e500);
+}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
new file mode 100644
index 0000000000..ccb8f16ffe
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -0,0 +1,803 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
+ *         Alexander Graf, agraf@suse.de
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <asm/kvm_ppc.h>
+#include <asm/pte-walk.h>
+
+#include "e500.h"
+#include "timing.h"
+#include "e500_mmu_host.h"
+
+#include "trace_booke.h"
+
+#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
+
+static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
+
+static inline unsigned int tlb1_max_shadow_size(void)
+{
+	/* reserve one entry for magic page */
+	return host_tlb_params[1].entries - tlbcam_index - 1;
+}
+
+static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
+{
+	/* Mask off reserved bits. */
+	mas3 &= MAS3_ATTRIB_MASK;
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	if (!usermode) {
+		/* Guest is in supervisor mode,
+		 * so we need to translate guest
+		 * supervisor permissions into user permissions. */
+		mas3 &= ~E500_TLB_USER_PERM_MASK;
+		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
+	}
+	mas3 |= E500_TLB_SUPER_PERM_MASK;
+#endif
+	return mas3;
+}
+
+/*
+ * writing shadow tlb entry to host TLB
+ */
+static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
+				     uint32_t mas0,
+				     uint32_t lpid)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS0, mas0);
+	mtspr(SPRN_MAS1, stlbe->mas1);
+	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
+	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
+	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_MAS8, MAS8_TGS | get_thread_specific_lpid(lpid));
+#endif
+	asm volatile("isync; tlbwe" : : : "memory");
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/* Must clear mas8 for other host tlbwe's */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+#endif
+	local_irq_restore(flags);
+
+	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
+	                              stlbe->mas2, stlbe->mas7_3);
+}
+
+/*
+ * Acquire a mas0 with victim hint, as if we just took a TLB miss.
+ *
+ * We don't care about the address we're searching for, other than that it's
+ * in the right set and is not present in the TLB.  Using a zero PID and a
+ * userspace address means we don't have to set and then restore MAS5, or
+ * calculate a proper MAS6 value.
+ */
+static u32 get_host_mas0(unsigned long eaddr)
+{
+	unsigned long flags;
+	u32 mas0;
+	u32 mas4;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS6, 0);
+	mas4 = mfspr(SPRN_MAS4);
+	mtspr(SPRN_MAS4, mas4 & ~MAS4_TLBSEL_MASK);
+	asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
+	mas0 = mfspr(SPRN_MAS0);
+	mtspr(SPRN_MAS4, mas4);
+	local_irq_restore(flags);
+
+	return mas0;
+}
+
+/* sesel is for tlb1 only */
+static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+		int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	u32 mas0;
+
+	if (tlbsel == 0) {
+		mas0 = get_host_mas0(stlbe->mas2);
+		__write_host_tlbe(stlbe, mas0, vcpu_e500->vcpu.kvm->arch.lpid);
+	} else {
+		__write_host_tlbe(stlbe,
+				  MAS0_TLBSEL(1) |
+				  MAS0_ESEL(to_htlb1_esel(sesel)),
+				  vcpu_e500->vcpu.kvm->arch.lpid);
+	}
+}
+
+/* sesel is for tlb1 only */
+static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+			struct kvm_book3e_206_tlb_entry *gtlbe,
+			struct kvm_book3e_206_tlb_entry *stlbe,
+			int stlbsel, int sesel)
+{
+	int stid;
+
+	preempt_disable();
+	stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
+
+	stlbe->mas1 |= MAS1_TID(stid);
+	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
+	preempt_enable();
+}
+
+#ifdef CONFIG_KVM_E500V2
+/* XXX should be a hook in the gva2hpa translation */
+void kvmppc_map_magic(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry magic;
+	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+	unsigned int stid;
+	kvm_pfn_t pfn;
+
+	pfn = (kvm_pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+	get_page(pfn_to_page(pfn));
+
+	preempt_disable();
+	stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
+
+	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
+		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
+	magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+		       MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+	magic.mas8 = 0;
+
+	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index), 0);
+	preempt_enable();
+}
+#endif
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+			 int esel)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
+	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref;
+
+	/* Don't bother with unmapped entries */
+	if (!(ref->flags & E500_TLB_VALID)) {
+		WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0),
+		     "%s: flags %x\n", __func__, ref->flags);
+		WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]);
+	}
+
+	if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) {
+		u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
+		int hw_tlb_indx;
+		unsigned long flags;
+
+		local_irq_save(flags);
+		while (tmp) {
+			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+			mtspr(SPRN_MAS0,
+			      MAS0_TLBSEL(1) |
+			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
+			mtspr(SPRN_MAS1, 0);
+			asm volatile("tlbwe");
+			vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
+			tmp &= tmp - 1;
+		}
+		mb();
+		vcpu_e500->g2h_tlb1_map[esel] = 0;
+		ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID);
+		local_irq_restore(flags);
+	}
+
+	if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) {
+		/*
+		 * TLB1 entry is backed by 4k pages. This should happen
+		 * rarely and is not worth optimizing. Invalidate everything.
+		 */
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID);
+	}
+
+	/*
+	 * If TLB entry is still valid then it's a TLB0 entry, and thus
+	 * backed by at most one host tlbe per shadow pid
+	 */
+	if (ref->flags & E500_TLB_VALID)
+		kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
+
+	/* Mark the TLB as not backed by the host anymore */
+	ref->flags = 0;
+}
+
+static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
+}
+
+static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
+					 struct kvm_book3e_206_tlb_entry *gtlbe,
+					 kvm_pfn_t pfn, unsigned int wimg)
+{
+	ref->pfn = pfn;
+	ref->flags = E500_TLB_VALID;
+
+	/* Use guest supplied MAS2_G and MAS2_E */
+	ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
+
+	/* Mark the page accessed */
+	kvm_set_pfn_accessed(pfn);
+
+	if (tlbe_is_writable(gtlbe))
+		kvm_set_pfn_dirty(pfn);
+}
+
+static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
+{
+	if (ref->flags & E500_TLB_VALID) {
+		/* FIXME: don't log bogus pfn for TLB1 */
+		trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
+		ref->flags = 0;
+	}
+}
+
+static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	if (vcpu_e500->g2h_tlb1_map)
+		memset(vcpu_e500->g2h_tlb1_map, 0,
+		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
+	if (vcpu_e500->h2g_tlb1_rmap)
+		memset(vcpu_e500->h2g_tlb1_rmap, 0,
+		       sizeof(unsigned int) * host_tlb_params[1].entries);
+}
+
+static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int tlbsel;
+	int i;
+
+	for (tlbsel = 0; tlbsel <= 1; tlbsel++) {
+		for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
+			struct tlbe_ref *ref =
+				&vcpu_e500->gtlb_priv[tlbsel][i].ref;
+			kvmppc_e500_ref_release(ref);
+		}
+	}
+}
+
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	kvmppc_e500_tlbil_all(vcpu_e500);
+	clear_tlb_privs(vcpu_e500);
+	clear_tlb1_bitmap(vcpu_e500);
+}
+
+/* TID must be supplied by the caller */
+static void kvmppc_e500_setup_stlbe(
+	struct kvm_vcpu *vcpu,
+	struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tsize, struct tlbe_ref *ref, u64 gvaddr,
+	struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	kvm_pfn_t pfn = ref->pfn;
+	u32 pr = vcpu->arch.shared->msr & MSR_PR;
+
+	BUG_ON(!(ref->flags & E500_TLB_VALID));
+
+	/* Force IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR);
+	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+}
+
+static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+	u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
+	struct tlbe_ref *ref)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long pfn = 0; /* silence GCC warning */
+	unsigned long hva;
+	int pfnmap = 0;
+	int tsize = BOOK3E_PAGESZ_4K;
+	int ret = 0;
+	unsigned long mmu_seq;
+	struct kvm *kvm = vcpu_e500->vcpu.kvm;
+	unsigned long tsize_pages = 0;
+	pte_t *ptep;
+	unsigned int wimg = 0;
+	pgd_t *pgdir;
+	unsigned long flags;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_invalidate_seq;
+	smp_rmb();
+
+	/*
+	 * Translate guest physical to true physical, acquiring
+	 * a page reference if it is normal, non-reserved memory.
+	 *
+	 * gfn_to_memslot() must succeed because otherwise we wouldn't
+	 * have gotten this far.  Eventually we should just pass the slot
+	 * pointer through from the first lookup.
+	 */
+	slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	if (tlbsel == 1) {
+		struct vm_area_struct *vma;
+		mmap_read_lock(kvm->mm);
+
+		vma = find_vma(kvm->mm, hva);
+		if (vma && hva >= vma->vm_start &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			/*
+			 * This VMA is a physically contiguous region (e.g.
+			 * /dev/mem) that bypasses normal Linux page
+			 * management.  Find the overlap between the
+			 * vma and the memslot.
+			 */
+
+			unsigned long start, end;
+			unsigned long slot_start, slot_end;
+
+			pfnmap = 1;
+
+			start = vma->vm_pgoff;
+			end = start +
+			      vma_pages(vma);
+
+			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+
+			slot_start = pfn - (gfn - slot->base_gfn);
+			slot_end = slot_start + slot->npages;
+
+			if (start < slot_start)
+				start = slot_start;
+			if (end > slot_end)
+				end = slot_end;
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+			/*
+			 * Now find the largest tsize (up to what the guest
+			 * requested) that will cover gfn, stay within the
+			 * range, and for which gfn and pfn are mutually
+			 * aligned.
+			 */
+
+			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+				unsigned long gfn_start, gfn_end;
+				tsize_pages = 1UL << (tsize - 2);
+
+				gfn_start = gfn & ~(tsize_pages - 1);
+				gfn_end = gfn_start + tsize_pages;
+
+				if (gfn_start + pfn - gfn < start)
+					continue;
+				if (gfn_end + pfn - gfn > end)
+					continue;
+				if ((gfn & (tsize_pages - 1)) !=
+				    (pfn & (tsize_pages - 1)))
+					continue;
+
+				gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+				pfn &= ~(tsize_pages - 1);
+				break;
+			}
+		} else if (vma && hva >= vma->vm_start &&
+			   is_vm_hugetlb_page(vma)) {
+			unsigned long psize = vma_kernel_pagesize(vma);
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * Take the largest page size that satisfies both host
+			 * and guest mapping
+			 */
+			tsize = min(__ilog2(psize) - 10, tsize);
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+		}
+
+		mmap_read_unlock(kvm->mm);
+	}
+
+	if (likely(!pfnmap)) {
+		tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
+		pfn = gfn_to_pfn_memslot(slot, gfn);
+		if (is_error_noslot_pfn(pfn)) {
+			if (printk_ratelimit())
+				pr_err("%s: real page not found for gfn %lx\n",
+				       __func__, (long)gfn);
+			return -EINVAL;
+		}
+
+		/* Align guest and physical address to page map boundaries */
+		pfn &= ~(tsize_pages - 1);
+		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+	}
+
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+
+	pgdir = vcpu_e500->vcpu.arch.pgdir;
+	/*
+	 * We are just looking at the wimg bits, so we don't
+	 * care much about the trans splitting bit.
+	 * We are holding kvm->mmu_lock so a notifier invalidate
+	 * can't run hence pfn won't change.
+	 */
+	local_irq_save(flags);
+	ptep = find_linux_pte(pgdir, hva, NULL, NULL);
+	if (ptep) {
+		pte_t pte = READ_ONCE(*ptep);
+
+		if (pte_present(pte)) {
+			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+				MAS2_WIMGE_MASK;
+			local_irq_restore(flags);
+		} else {
+			local_irq_restore(flags);
+			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
+					   __func__, (long)gfn, pfn);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
+
+	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
+				ref, gvaddr, stlbe);
+
+	/* Clear i-cache for new pages */
+	kvmppc_mmu_flush_icache(pfn);
+
+out:
+	spin_unlock(&kvm->mmu_lock);
+
+	/* Drop refcount on page, so that mmu notifiers can clear it */
+	kvm_release_pfn_clean(pfn);
+
+	return ret;
+}
+
+/* XXX only map the one-one case, for now use TLB0 */
+static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel,
+				struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	struct tlbe_ref *ref;
+	int stlbsel = 0;
+	int sesel = 0;
+	int r;
+
+	gtlbe = get_entry(vcpu_e500, 0, esel);
+	ref = &vcpu_e500->gtlb_priv[0][esel].ref;
+
+	r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
+			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
+			gtlbe, 0, stlbe, ref);
+	if (r)
+		return r;
+
+	write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel);
+
+	return 0;
+}
+
+static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500,
+				     struct tlbe_ref *ref,
+				     int esel)
+{
+	unsigned int sesel = vcpu_e500->host_tlb1_nv++;
+
+	if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
+		vcpu_e500->host_tlb1_nv = 0;
+
+	if (vcpu_e500->h2g_tlb1_rmap[sesel]) {
+		unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1;
+		vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel);
+	}
+
+	vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
+	vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel;
+	vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1;
+	WARN_ON(!(ref->flags & E500_TLB_VALID));
+
+	return sesel;
+}
+
+/* Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB. */
+/* For both one-one and one-to-many */
+static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+		struct kvm_book3e_206_tlb_entry *stlbe, int esel)
+{
+	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref;
+	int sesel;
+	int r;
+
+	r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe,
+				   ref);
+	if (r)
+		return r;
+
+	/* Use TLB0 when we can only map a page with 4k */
+	if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) {
+		vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0;
+		write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0);
+		return 0;
+	}
+
+	/* Otherwise map into TLB1 */
+	sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel);
+	write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel);
+
+	return 0;
+}
+
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
+		    unsigned int index)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct tlbe_priv *priv;
+	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
+	int tlbsel = tlbsel_of(index);
+	int esel = esel_of(index);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+
+	switch (tlbsel) {
+	case 0:
+		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
+
+		/* Triggers after clear_tlb_privs or on initial mapping */
+		if (!(priv->ref.flags & E500_TLB_VALID)) {
+			kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+		} else {
+			kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
+						&priv->ref, eaddr, &stlbe);
+			write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0);
+		}
+		break;
+
+	case 1: {
+		gfn_t gfn = gpaddr >> PAGE_SHIFT;
+		kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe,
+				     esel);
+		break;
+	}
+
+	default:
+		BUG();
+		break;
+	}
+}
+
+#ifdef CONFIG_KVM_BOOKE_HV
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+		enum instruction_fetch_type type, unsigned long *instr)
+{
+	gva_t geaddr;
+	hpa_t addr;
+	hfn_t pfn;
+	hva_t eaddr;
+	u32 mas1, mas2, mas3;
+	u64 mas7_mas3;
+	struct page *page;
+	unsigned int addr_space, psize_shift;
+	bool pr;
+	unsigned long flags;
+
+	/* Search TLB for guest pc to get the real address */
+	geaddr = kvmppc_get_pc(vcpu);
+
+	addr_space = (vcpu->arch.shared->msr & MSR_IS) >> MSR_IR_LG;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS6, (vcpu->arch.pid << MAS6_SPID_SHIFT) | addr_space);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(vcpu));
+	asm volatile("tlbsx 0, %[geaddr]\n" : :
+		     [geaddr] "r" (geaddr));
+	mtspr(SPRN_MAS5, 0);
+	mtspr(SPRN_MAS8, 0);
+	mas1 = mfspr(SPRN_MAS1);
+	mas2 = mfspr(SPRN_MAS2);
+	mas3 = mfspr(SPRN_MAS3);
+#ifdef CONFIG_64BIT
+	mas7_mas3 = mfspr(SPRN_MAS7_MAS3);
+#else
+	mas7_mas3 = ((u64)mfspr(SPRN_MAS7) << 32) | mas3;
+#endif
+	local_irq_restore(flags);
+
+	/*
+	 * If the TLB entry for guest pc was evicted, return to the guest.
+	 * There are high chances to find a valid TLB entry next time.
+	 */
+	if (!(mas1 & MAS1_VALID))
+		return EMULATE_AGAIN;
+
+	/*
+	 * Another thread may rewrite the TLB entry in parallel, don't
+	 * execute from the address if the execute permission is not set
+	 */
+	pr = vcpu->arch.shared->msr & MSR_PR;
+	if (unlikely((pr && !(mas3 & MAS3_UX)) ||
+		     (!pr && !(mas3 & MAS3_SX)))) {
+		pr_err_ratelimited(
+			"%s: Instruction emulation from guest address %08lx without execute permission\n",
+			__func__, geaddr);
+		return EMULATE_AGAIN;
+	}
+
+	/*
+	 * The real address will be mapped by a cacheable, memory coherent,
+	 * write-back page. Check for mismatches when LRAT is used.
+	 */
+	if (has_feature(vcpu, VCPU_FTR_MMU_V2) &&
+	    unlikely((mas2 & MAS2_I) || (mas2 & MAS2_W) || !(mas2 & MAS2_M))) {
+		pr_err_ratelimited(
+			"%s: Instruction emulation from guest address %08lx mismatches storage attributes\n",
+			__func__, geaddr);
+		return EMULATE_AGAIN;
+	}
+
+	/* Get pfn */
+	psize_shift = MAS1_GET_TSIZE(mas1) + 10;
+	addr = (mas7_mas3 & (~0ULL << psize_shift)) |
+	       (geaddr & ((1ULL << psize_shift) - 1ULL));
+	pfn = addr >> PAGE_SHIFT;
+
+	/* Guard against emulation from devices area */
+	if (unlikely(!page_is_ram(pfn))) {
+		pr_err_ratelimited("%s: Instruction emulation from non-RAM host address %08llx is not supported\n",
+			 __func__, addr);
+		return EMULATE_AGAIN;
+	}
+
+	/* Map a page and get guest's instruction */
+	page = pfn_to_page(pfn);
+	eaddr = (unsigned long)kmap_atomic(page);
+	*instr = *(u32 *)(eaddr | (unsigned long)(addr & ~PAGE_MASK));
+	kunmap_atomic((u32 *)eaddr);
+
+	return EMULATE_DONE;
+}
+#else
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+		enum instruction_fetch_type type, unsigned long *instr)
+{
+	return EMULATE_AGAIN;
+}
+#endif
+
+/************* MMU Notifiers *************/
+
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/*
+	 * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the to be unmapped page
+	 */
+	return true;
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return kvm_e500_mmu_unmap_gfn(kvm, range);
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* XXX could be more clever ;) */
+	return false;
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* XXX could be more clever ;) */
+	return false;
+}
+
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	/* The page will get remapped properly on its next fault */
+	return kvm_e500_mmu_unmap_gfn(kvm, range);
+}
+
+/*****************************************/
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
+	host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	/*
+	 * This should never happen on real e500 hardware, but is
+	 * architecturally possible -- e.g. in some weird nested
+	 * virtualization case.
+	 */
+	if (host_tlb_params[0].entries == 0 ||
+	    host_tlb_params[1].entries == 0) {
+		pr_err("%s: need to know host tlb size\n", __func__);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
+				  TLBnCFG_ASSOC_SHIFT;
+	host_tlb_params[1].ways = host_tlb_params[1].entries;
+
+	if (!is_power_of_2(host_tlb_params[0].entries) ||
+	    !is_power_of_2(host_tlb_params[0].ways) ||
+	    host_tlb_params[0].entries < host_tlb_params[0].ways ||
+	    host_tlb_params[0].ways == 0) {
+		pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
+		       __func__, host_tlb_params[0].entries,
+		       host_tlb_params[0].ways);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].sets =
+		host_tlb_params[0].entries / host_tlb_params[0].ways;
+	host_tlb_params[1].sets = 1;
+	vcpu_e500->h2g_tlb1_rmap = kcalloc(host_tlb_params[1].entries,
+					   sizeof(*vcpu_e500->h2g_tlb1_rmap),
+					   GFP_KERNEL);
+	if (!vcpu_e500->h2g_tlb1_rmap)
+		return -EINVAL;
+
+	return 0;
+}
+
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->h2g_tlb1_rmap);
+}
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
new file mode 100644
index 0000000000..d8178cc86b
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ */
+
+#ifndef KVM_E500_MMU_HOST_H
+#define KVM_E500_MMU_HOST_H
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+			 int esel);
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#endif /* KVM_E500_MMU_HOST_H */
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
new file mode 100644
index 0000000000..e476e107a9
--- /dev/null
+++ b/arch/powerpc/kvm/e500mc.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Varun Sethi, <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/e500.c,
+ * by Yu Liu <yu.liu@freescale.com>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
+#include <asm/ppc-opcode.h>
+
+#include "booke.h"
+#include "e500.h"
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type)
+{
+	enum ppc_dbell dbell_type;
+	unsigned long tag;
+
+	switch (type) {
+	case INT_CLASS_NONCRIT:
+		dbell_type = PPC_G_DBELL;
+		break;
+	case INT_CLASS_CRIT:
+		dbell_type = PPC_G_DBELL_CRIT;
+		break;
+	case INT_CLASS_MC:
+		dbell_type = PPC_G_DBELL_MC;
+		break;
+	default:
+		WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type);
+		return;
+	}
+
+	preempt_disable();
+	tag = PPC_DBELL_LPID(get_lpid(vcpu)) | vcpu->vcpu_id;
+	mb();
+	ppc_msgsnd(dbell_type, 0, tag);
+	preempt_enable();
+}
+
+/* gtlbe must not be mapped by more than one host tlb entry */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned int tid, ts;
+	gva_t eaddr;
+	u32 val;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	/* We search the host TLB to invalidate its shadow TLB entry */
+	val = (tid << 16) | ts;
+	eaddr = get_tlb_eaddr(gtlbe);
+
+	local_irq_save(flags);
+
+	mtspr(SPRN_MAS6, val);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu));
+
+	asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr));
+	val = mfspr(SPRN_MAS1);
+	if (val & MAS1_VALID) {
+		mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+		asm volatile("tlbwe");
+	}
+	mtspr(SPRN_MAS5, 0);
+	/* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+
+	local_irq_restore(flags);
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu));
+	/*
+	 * clang-17 and older could not assemble tlbilxlpid.
+	 * https://github.com/ClangBuiltLinux/linux/issues/1891
+	 */
+	asm volatile (PPC_TLBILX_LPID);
+	mtspr(SPRN_MAS5, 0);
+	local_irq_restore(flags);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	vcpu->arch.pid = pid;
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+}
+
+/* We use two lpids per VM */
+static DEFINE_PER_CPU(struct kvm_vcpu *[KVMPPC_NR_LPIDS], last_vcpu_of_lpid);
+
+static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	mtspr(SPRN_LPID, get_lpid(vcpu));
+	mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+	mtspr(SPRN_GPIR, vcpu->vcpu_id);
+	mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp);
+	vcpu->arch.eplc = EPC_EGS | (get_lpid(vcpu) << EPC_ELPID_SHIFT);
+	vcpu->arch.epsc = vcpu->arch.eplc;
+	mtspr(SPRN_EPLC, vcpu->arch.eplc);
+	mtspr(SPRN_EPSC, vcpu->arch.epsc);
+
+	mtspr(SPRN_GIVPR, vcpu->arch.ivpr);
+	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
+	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0);
+	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1);
+	mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2);
+	mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3);
+
+	mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0);
+	mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1);
+
+	mtspr(SPRN_GEPR, vcpu->arch.epr);
+	mtspr(SPRN_GDEAR, vcpu->arch.shared->dar);
+	mtspr(SPRN_GESR, vcpu->arch.shared->esr);
+
+	if (vcpu->arch.oldpir != mfspr(SPRN_PIR) ||
+	    __this_cpu_read(last_vcpu_of_lpid[get_lpid(vcpu)]) != vcpu) {
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		__this_cpu_write(last_vcpu_of_lpid[get_lpid(vcpu)], vcpu);
+	}
+}
+
+static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.eplc = mfspr(SPRN_EPLC);
+	vcpu->arch.epsc = mfspr(SPRN_EPSC);
+
+	vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0);
+	vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1);
+	vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2);
+	vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3);
+
+	vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0);
+	vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1);
+
+	vcpu->arch.epr = mfspr(SPRN_GEPR);
+	vcpu->arch.shared->dar = mfspr(SPRN_GDEAR);
+	vcpu->arch.shared->esr = mfspr(SPRN_GESR);
+
+	vcpu->arch.oldpir = mfspr(SPRN_PIR);
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+static int kvmppc_e500mc_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0)
+		r = 0;
+	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
+		r = 0;
+#ifdef CONFIG_ALTIVEC
+	/*
+	 * Since guests have the privilege to enable AltiVec, we need AltiVec
+	 * support in the host to save/restore their context.
+	 * Don't use CPU_FTR_ALTIVEC to identify cores with AltiVec unit
+	 * because it's cleared in the absence of CONFIG_ALTIVEC!
+	 */
+	else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
+		r = 0;
+#endif
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
+				 SPRN_EPCR_DUVD;
+#ifdef CONFIG_64BIT
+	vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
+#endif
+	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_PMMP;
+
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500MC;
+
+	return 0;
+}
+
+static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu,
+					struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM |
+			       KVM_SREGS_E_PC;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+	sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+	sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+
+	return kvmppc_get_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu,
+					struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PC) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] =
+			sregs->u.e.ivor_high[4];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] =
+			sregs->u.e.ivor_high[5];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+			      union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_SPRG9:
+		*val = get_reg_val(id, vcpu->arch.sprg9);
+		break;
+	default:
+		r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	}
+
+	return r;
+}
+
+static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+			      union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_SPRG9:
+		vcpu->arch.sprg9 = set_reg_val(id, *val);
+		break;
+	default:
+		r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+	}
+
+	return r;
+}
+
+static int kvmppc_core_vcpu_create_e500mc(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	int err;
+
+	BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0);
+	vcpu_e500 = to_e500(vcpu);
+
+	/* Invalid PIR value -- this LPID doesn't have valid state on any cpu */
+	vcpu->arch.oldpir = 0xffffffff;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		return err;
+
+	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!vcpu->arch.shared) {
+		err = -ENOMEM;
+		goto uninit_tlb;
+	}
+
+	return 0;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	return err;
+}
+
+static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+}
+
+static int kvmppc_core_init_vm_e500mc(struct kvm *kvm)
+{
+	int lpid;
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return lpid;
+
+	/*
+	 * Use two lpids per VM on cores with two threads like e6500. Use
+	 * even numbers to speedup vcpu lpid computation with consecutive lpids
+	 * per VM. vm1 will use lpids 2 and 3, vm2 lpids 4 and 5, and so on.
+	 */
+	if (threads_per_core == 2)
+		lpid <<= 1;
+
+	kvm->arch.lpid = lpid;
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm)
+{
+	int lpid = kvm->arch.lpid;
+
+	if (threads_per_core == 2)
+		lpid >>= 1;
+
+	kvmppc_free_lpid(lpid);
+}
+
+static struct kvmppc_ops kvm_ops_e500mc = {
+	.get_sregs = kvmppc_core_get_sregs_e500mc,
+	.set_sregs = kvmppc_core_set_sregs_e500mc,
+	.get_one_reg = kvmppc_get_one_reg_e500mc,
+	.set_one_reg = kvmppc_set_one_reg_e500mc,
+	.vcpu_load   = kvmppc_core_vcpu_load_e500mc,
+	.vcpu_put    = kvmppc_core_vcpu_put_e500mc,
+	.vcpu_create = kvmppc_core_vcpu_create_e500mc,
+	.vcpu_free   = kvmppc_core_vcpu_free_e500mc,
+	.init_vm = kvmppc_core_init_vm_e500mc,
+	.destroy_vm = kvmppc_core_destroy_vm_e500mc,
+	.emulate_op = kvmppc_core_emulate_op_e500,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+	.create_vcpu_debugfs = kvmppc_create_vcpu_debugfs_e500,
+};
+
+static int __init kvmppc_e500mc_init(void)
+{
+	int r;
+
+	r = kvmppc_e500mc_check_processor_compat();
+	if (r)
+		goto err_out;
+
+	r = kvmppc_booke_init();
+	if (r)
+		goto err_out;
+
+	/*
+	 * Use two lpids per VM on dual threaded processors like e6500
+	 * to workarround the lack of tlb write conditional instruction.
+	 * Expose half the number of available hardware lpids to the lpid
+	 * allocator.
+	 */
+	kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core);
+
+	r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_e500mc.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500mc;
+
+err_out:
+	return r;
+}
+
+static void __exit kvmppc_e500mc_exit(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500mc_init);
+module_exit(kvmppc_e500mc_exit);
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
new file mode 100644
index 0000000000..355d5206e8
--- /dev/null
+++ b/arch/powerpc/kvm/emulate.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/clockchips.h>
+
+#include <asm/reg.h>
+#include <asm/time.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include "timing.h"
+#include "trace.h"
+
+void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
+{
+	unsigned long dec_nsec;
+	unsigned long long dec_time;
+
+	pr_debug("mtDEC: %lx\n", vcpu->arch.dec);
+	hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+#ifdef CONFIG_PPC_BOOK3S
+	/* mtdec lowers the interrupt line when positive. */
+	kvmppc_core_dequeue_dec(vcpu);
+#endif
+
+#ifdef CONFIG_BOOKE
+	/* On BOOKE, DEC = 0 is as good as decrementer not enabled */
+	if (vcpu->arch.dec == 0)
+		return;
+#endif
+
+	/*
+	 * The decrementer ticks at the same rate as the timebase, so
+	 * that's how we convert the guest DEC value to the number of
+	 * host ticks.
+	 */
+
+	dec_time = vcpu->arch.dec;
+	/*
+	 * Guest timebase ticks at the same frequency as host timebase.
+	 * So use the host timebase calculations for decrementer emulation.
+	 */
+	dec_time = tb_to_ns(dec_time);
+	dec_nsec = do_div(dec_time, NSEC_PER_SEC);
+	hrtimer_start(&vcpu->arch.dec_timer,
+		ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
+	vcpu->arch.dec_jiffies = get_tb();
+}
+
+u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
+{
+	u64 jd = tb - vcpu->arch.dec_jiffies;
+
+#ifdef CONFIG_BOOKE
+	if (vcpu->arch.dec < jd)
+		return 0;
+#endif
+
+	return vcpu->arch.dec - jd;
+}
+
+static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		kvmppc_set_srr0(vcpu, spr_val);
+		break;
+	case SPRN_SRR1:
+		kvmppc_set_srr1(vcpu, spr_val);
+		break;
+
+	/* XXX We need to context-switch the timebase for
+	 * watchdog and FIT. */
+	case SPRN_TBWL: break;
+	case SPRN_TBWU: break;
+
+	case SPRN_DEC:
+		vcpu->arch.dec = (u32) spr_val;
+		kvmppc_emulate_dec(vcpu);
+		break;
+
+	case SPRN_SPRG0:
+		kvmppc_set_sprg0(vcpu, spr_val);
+		break;
+	case SPRN_SPRG1:
+		kvmppc_set_sprg1(vcpu, spr_val);
+		break;
+	case SPRN_SPRG2:
+		kvmppc_set_sprg2(vcpu, spr_val);
+		break;
+	case SPRN_SPRG3:
+		kvmppc_set_sprg3(vcpu, spr_val);
+		break;
+
+	/* PIR can legally be written, but we ignore it */
+	case SPRN_PIR: break;
+
+	default:
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn,
+								  spr_val);
+		if (emulated == EMULATE_FAIL)
+			printk(KERN_INFO "mtspr: unknown spr "
+				"0x%x\n", sprn);
+		break;
+	}
+
+	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = 0;
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		spr_val = kvmppc_get_srr0(vcpu);
+		break;
+	case SPRN_SRR1:
+		spr_val = kvmppc_get_srr1(vcpu);
+		break;
+	case SPRN_PVR:
+		spr_val = vcpu->arch.pvr;
+		break;
+	case SPRN_PIR:
+		spr_val = vcpu->vcpu_id;
+		break;
+
+	/* Note: mftb and TBRL/TBWL are user-accessible, so
+	 * the guest can always access the real TB anyways.
+	 * In fact, we probably will never see these traps. */
+	case SPRN_TBWL:
+		spr_val = get_tb() >> 32;
+		break;
+	case SPRN_TBWU:
+		spr_val = get_tb();
+		break;
+
+	case SPRN_SPRG0:
+		spr_val = kvmppc_get_sprg0(vcpu);
+		break;
+	case SPRN_SPRG1:
+		spr_val = kvmppc_get_sprg1(vcpu);
+		break;
+	case SPRN_SPRG2:
+		spr_val = kvmppc_get_sprg2(vcpu);
+		break;
+	case SPRN_SPRG3:
+		spr_val = kvmppc_get_sprg3(vcpu);
+		break;
+	/* Note: SPRG4-7 are user-readable, so we don't get
+	 * a trap. */
+
+	case SPRN_DEC:
+		spr_val = kvmppc_get_dec(vcpu, get_tb());
+		break;
+	default:
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn,
+								  &spr_val);
+		if (unlikely(emulated == EMULATE_FAIL)) {
+			printk(KERN_INFO "mfspr: unknown spr "
+				"0x%x\n", sprn);
+		}
+		break;
+	}
+
+	if (emulated == EMULATE_DONE)
+		kvmppc_set_gpr(vcpu, rt, spr_val);
+	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+
+	return emulated;
+}
+
+/* XXX Should probably auto-generate instruction decoding for a particular core
+ * from opcode tables in the future. */
+int kvmppc_emulate_instruction(struct kvm_vcpu *vcpu)
+{
+	u32 inst;
+	ppc_inst_t pinst;
+	int rs, rt, sprn;
+	enum emulation_result emulated;
+	int advance = 1;
+
+	/* this default type might be overwritten by subcategories */
+	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
+	inst = ppc_inst_val(pinst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	pr_debug("Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
+
+	rs = get_rs(inst);
+	rt = get_rt(inst);
+	sprn = get_sprn(inst);
+
+	switch (get_op(inst)) {
+	case OP_TRAP:
+#ifdef CONFIG_PPC_BOOK3S
+	case OP_TRAP_64:
+		kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
+#else
+		kvmppc_core_queue_program(vcpu,
+					  vcpu->arch.shared->esr | ESR_PTR);
+#endif
+		advance = 0;
+		break;
+
+	case 31:
+		switch (get_xop(inst)) {
+
+		case OP_31_XOP_TRAP:
+#ifdef CONFIG_64BIT
+		case OP_31_XOP_TRAP_64:
+#endif
+#ifdef CONFIG_PPC_BOOK3S
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
+#else
+			kvmppc_core_queue_program(vcpu,
+					vcpu->arch.shared->esr | ESR_PTR);
+#endif
+			advance = 0;
+			break;
+
+		case OP_31_XOP_MFSPR:
+			emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
+			if (emulated == EMULATE_AGAIN) {
+				emulated = EMULATE_DONE;
+				advance = 0;
+			}
+			break;
+
+		case OP_31_XOP_MTSPR:
+			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
+			if (emulated == EMULATE_AGAIN) {
+				emulated = EMULATE_DONE;
+				advance = 0;
+			}
+			break;
+
+		case OP_31_XOP_TLBSYNC:
+			break;
+
+		default:
+			/* Attempt core-specific emulation below. */
+			emulated = EMULATE_FAIL;
+		}
+		break;
+
+	case 0:
+		/*
+		 * Instruction with primary opcode 0. Based on PowerISA
+		 * these are illegal instructions.
+		 */
+		if (inst == KVMPPC_INST_SW_BREAKPOINT) {
+			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+			vcpu->run->debug.arch.status = 0;
+			vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
+			emulated = EMULATE_EXIT_USER;
+			advance = 0;
+		} else
+			emulated = EMULATE_FAIL;
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL) {
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_op(vcpu, inst,
+							       &advance);
+		if (emulated == EMULATE_AGAIN) {
+			advance = 0;
+		} else if (emulated == EMULATE_FAIL) {
+			advance = 0;
+			printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
+			       "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+		}
+	}
+
+	trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated);
+
+	/* Advance past emulated instruction. */
+	/*
+	 * If this ever handles prefixed instructions, the 4
+	 * will need to become ppc_inst_len(pinst) instead.
+	 */
+	if (advance)
+		kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+
+	return emulated;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction);
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
new file mode 100644
index 0000000000..059c08ae03
--- /dev/null
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/clockchips.h>
+
+#include <asm/reg.h>
+#include <asm/time.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include <asm/sstep.h>
+#include "timing.h"
+#include "trace.h"
+
+#ifdef CONFIG_PPC_FPU
+static bool kvmppc_check_fp_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {
+		kvmppc_core_queue_fpunavail(vcpu, kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_PPC_FPU */
+
+#ifdef CONFIG_VSX
+static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_VSX)) {
+		kvmppc_core_queue_vsx_unavail(vcpu, kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_ALTIVEC
+static bool kvmppc_check_altivec_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_VEC)) {
+		kvmppc_core_queue_vec_unavail(vcpu, kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_ALTIVEC */
+
+/*
+ * XXX to do:
+ * lfiwax, lfiwzx
+ * vector loads and stores
+ *
+ * Instructions that trap when used on cache-inhibited mappings
+ * are not emulated here: multiple and string instructions,
+ * lq/stq, and the load-reserve/store-conditional instructions.
+ */
+int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
+{
+	ppc_inst_t inst;
+	enum emulation_result emulated = EMULATE_FAIL;
+	struct instruction_op op;
+
+	/* this default type might be overwritten by subcategories */
+	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	vcpu->arch.mmio_vsx_copy_nums = 0;
+	vcpu->arch.mmio_vsx_offset = 0;
+	vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
+	vcpu->arch.mmio_sp64_extend = 0;
+	vcpu->arch.mmio_sign_extend = 0;
+	vcpu->arch.mmio_vmx_copy_nums = 0;
+	vcpu->arch.mmio_vmx_offset = 0;
+	vcpu->arch.mmio_host_swabbed = 0;
+
+	emulated = EMULATE_FAIL;
+	vcpu->arch.regs.msr = vcpu->arch.shared->msr;
+	if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
+		int type = op.type & INSTR_TYPE_MASK;
+		int size = GETSIZE(op.type);
+
+		vcpu->mmio_is_write = OP_IS_STORE(type);
+
+		switch (type) {
+		case LOAD:  {
+			int instr_byte_swap = op.type & BYTEREV;
+
+			if (op.type & SIGNEXT)
+				emulated = kvmppc_handle_loads(vcpu,
+						op.reg, size, !instr_byte_swap);
+			else
+				emulated = kvmppc_handle_load(vcpu,
+						op.reg, size, !instr_byte_swap);
+
+			if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+				kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+			break;
+		}
+#ifdef CONFIG_PPC_FPU
+		case LOAD_FP:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+
+			if (op.type & FPCONV)
+				vcpu->arch.mmio_sp64_extend = 1;
+
+			if (op.type & SIGNEXT)
+				emulated = kvmppc_handle_loads(vcpu,
+					     KVM_MMIO_REG_FPR|op.reg, size, 1);
+			else
+				emulated = kvmppc_handle_load(vcpu,
+					     KVM_MMIO_REG_FPR|op.reg, size, 1);
+
+			if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+				kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+			break;
+#endif
+#ifdef CONFIG_ALTIVEC
+		case LOAD_VMX:
+			if (kvmppc_check_altivec_disabled(vcpu))
+				return EMULATE_DONE;
+
+			/* Hardware enforces alignment of VMX accesses */
+			vcpu->arch.vaddr_accessed &= ~((unsigned long)size - 1);
+			vcpu->arch.paddr_accessed &= ~((unsigned long)size - 1);
+
+			if (size == 16) { /* lvx */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_DWORD;
+			} else if (size == 4) { /* lvewx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_WORD;
+			} else if (size == 2) { /* lvehx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_HWORD;
+			} else if (size == 1) { /* lvebx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_BYTE;
+			} else
+				break;
+
+			vcpu->arch.mmio_vmx_offset =
+				(vcpu->arch.vaddr_accessed & 0xf)/size;
+
+			if (size == 16) {
+				vcpu->arch.mmio_vmx_copy_nums = 2;
+				emulated = kvmppc_handle_vmx_load(vcpu,
+						KVM_MMIO_REG_VMX|op.reg,
+						8, 1);
+			} else {
+				vcpu->arch.mmio_vmx_copy_nums = 1;
+				emulated = kvmppc_handle_vmx_load(vcpu,
+						KVM_MMIO_REG_VMX|op.reg,
+						size, 1);
+			}
+			break;
+#endif
+#ifdef CONFIG_VSX
+		case LOAD_VSX: {
+			int io_size_each;
+
+			if (op.vsx_flags & VSX_CHECK_VEC) {
+				if (kvmppc_check_altivec_disabled(vcpu))
+					return EMULATE_DONE;
+			} else {
+				if (kvmppc_check_vsx_disabled(vcpu))
+					return EMULATE_DONE;
+			}
+
+			if (op.vsx_flags & VSX_FPCONV)
+				vcpu->arch.mmio_sp64_extend = 1;
+
+			if (op.element_size == 8)  {
+				if (op.vsx_flags & VSX_SPLAT)
+					vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
+				else
+					vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_DWORD;
+			} else if (op.element_size == 4) {
+				if (op.vsx_flags & VSX_SPLAT)
+					vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_WORD_LOAD_DUMP;
+				else
+					vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_WORD;
+			} else
+				break;
+
+			if (size < op.element_size) {
+				/* precision convert case: lxsspx, etc */
+				vcpu->arch.mmio_vsx_copy_nums = 1;
+				io_size_each = size;
+			} else { /* lxvw4x, lxvd2x, etc */
+				vcpu->arch.mmio_vsx_copy_nums =
+					size/op.element_size;
+				io_size_each = op.element_size;
+			}
+
+			emulated = kvmppc_handle_vsx_load(vcpu,
+					KVM_MMIO_REG_VSX|op.reg, io_size_each,
+					1, op.type & SIGNEXT);
+			break;
+		}
+#endif
+		case STORE:
+			/* if need byte reverse, op.val has been reversed by
+			 * analyse_instr().
+			 */
+			emulated = kvmppc_handle_store(vcpu, op.val, size, 1);
+
+			if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+				kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+			break;
+#ifdef CONFIG_PPC_FPU
+		case STORE_FP:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+
+			/* The FP registers need to be flushed so that
+			 * kvmppc_handle_store() can read actual FP vals
+			 * from vcpu->arch.
+			 */
+			if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+				vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+						MSR_FP);
+
+			if (op.type & FPCONV)
+				vcpu->arch.mmio_sp64_extend = 1;
+
+			emulated = kvmppc_handle_store(vcpu,
+					VCPU_FPR(vcpu, op.reg), size, 1);
+
+			if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+				kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+			break;
+#endif
+#ifdef CONFIG_ALTIVEC
+		case STORE_VMX:
+			if (kvmppc_check_altivec_disabled(vcpu))
+				return EMULATE_DONE;
+
+			/* Hardware enforces alignment of VMX accesses. */
+			vcpu->arch.vaddr_accessed &= ~((unsigned long)size - 1);
+			vcpu->arch.paddr_accessed &= ~((unsigned long)size - 1);
+
+			if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+				vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+						MSR_VEC);
+			if (size == 16) { /* stvx */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_DWORD;
+			} else if (size == 4) { /* stvewx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_WORD;
+			} else if (size == 2) { /* stvehx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_HWORD;
+			} else if (size == 1) { /* stvebx  */
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VMX_COPY_BYTE;
+			} else
+				break;
+
+			vcpu->arch.mmio_vmx_offset =
+				(vcpu->arch.vaddr_accessed & 0xf)/size;
+
+			if (size == 16) {
+				vcpu->arch.mmio_vmx_copy_nums = 2;
+				emulated = kvmppc_handle_vmx_store(vcpu,
+						op.reg, 8, 1);
+			} else {
+				vcpu->arch.mmio_vmx_copy_nums = 1;
+				emulated = kvmppc_handle_vmx_store(vcpu,
+						op.reg, size, 1);
+			}
+
+			break;
+#endif
+#ifdef CONFIG_VSX
+		case STORE_VSX: {
+			int io_size_each;
+
+			if (op.vsx_flags & VSX_CHECK_VEC) {
+				if (kvmppc_check_altivec_disabled(vcpu))
+					return EMULATE_DONE;
+			} else {
+				if (kvmppc_check_vsx_disabled(vcpu))
+					return EMULATE_DONE;
+			}
+
+			if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+				vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+						MSR_VSX);
+
+			if (op.vsx_flags & VSX_FPCONV)
+				vcpu->arch.mmio_sp64_extend = 1;
+
+			if (op.element_size == 8)
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_DWORD;
+			else if (op.element_size == 4)
+				vcpu->arch.mmio_copy_type =
+						KVMPPC_VSX_COPY_WORD;
+			else
+				break;
+
+			if (size < op.element_size) {
+				/* precise conversion case, like stxsspx */
+				vcpu->arch.mmio_vsx_copy_nums = 1;
+				io_size_each = size;
+			} else { /* stxvw4x, stxvd2x, etc */
+				vcpu->arch.mmio_vsx_copy_nums =
+						size/op.element_size;
+				io_size_each = op.element_size;
+			}
+
+			emulated = kvmppc_handle_vsx_store(vcpu,
+					op.reg, io_size_each, 1);
+			break;
+		}
+#endif
+		case CACHEOP:
+			/* Do nothing. The guest is performing dcbi because
+			 * hardware DMA is not snooped by the dcache, but
+			 * emulated DMA either goes through the dcache as
+			 * normal writes, or the host kernel has handled dcache
+			 * coherence.
+			 */
+			emulated = EMULATE_DONE;
+			break;
+		default:
+			break;
+		}
+	}
+
+	trace_kvm_ppc_instr(ppc_inst_val(inst), kvmppc_get_pc(vcpu), emulated);
+
+	/* Advance past emulated instruction. */
+	if (emulated != EMULATE_FAIL)
+		kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + ppc_inst_len(inst));
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
new file mode 100644
index 0000000000..b68e7f26a8
--- /dev/null
+++ b/arch/powerpc/kvm/fpu.S
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *  FPU helper code to use FPU operations from inside the kernel
+ *
+ *    Copyright (C) 2010 Alexander Graf (agraf@suse.de)
+ */
+
+#include <linux/pgtable.h>
+#include <linux/linkage.h>
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Instructions operating on single parameters */
+
+/*
+ * Single operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ */
+#define FPS_ONE_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+									\
+	name	0,0;							\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);	/* save new fpscr value */	\
+	blr
+
+/*
+ * Single operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ */
+#define FPS_TWO_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+	lfs	1,0(r6);						\
+									\
+	name	0,0,1;							\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	blr
+
+/*
+ * Single operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ * R7 = (short*)&param3
+ */
+#define FPS_THREE_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+	lfs	1,0(r6);						\
+	lfs	2,0(r7);						\
+									\
+	name	0,0,1,2;						\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	blr
+
+FPS_ONE_IN(fres)
+FPS_ONE_IN(frsqrte)
+FPS_ONE_IN(fsqrts)
+FPS_TWO_IN(fadds)
+FPS_TWO_IN(fdivs)
+FPS_TWO_IN(fmuls)
+FPS_TWO_IN(fsubs)
+FPS_THREE_IN(fmadds)
+FPS_THREE_IN(fmsubs)
+FPS_THREE_IN(fnmadds)
+FPS_THREE_IN(fnmsubs)
+FPS_THREE_IN(fsel)
+
+
+/* Instructions operating on double parameters */
+
+/*
+ * Beginning of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2 [load_two]
+ * R8 = (double*)&param3 [load_three]
+ * LR = instruction call function
+ */
+SYM_FUNC_START_LOCAL(fpd_load_three)
+	lfd	2,0(r8)			/* load param3 */
+SYM_FUNC_START_LOCAL(fpd_load_two)
+	lfd	1,0(r7)			/* load param2 */
+SYM_FUNC_START_LOCAL(fpd_load_one)
+	lfd	0,0(r6)			/* load param1 */
+SYM_FUNC_START_LOCAL(fpd_load_none)
+	lfd	3,0(r3)			/* load up fpscr value */
+	MTFSF_L(3)
+	lwz	r6, 0(r4)		/* load cr */
+	mtcr	r6
+	blr
+SYM_FUNC_END(fpd_load_none)
+SYM_FUNC_END(fpd_load_one)
+SYM_FUNC_END(fpd_load_two)
+SYM_FUNC_END(fpd_load_three)
+
+/*
+ * End of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * LR = caller of instruction call function
+ */
+SYM_FUNC_START_LOCAL(fpd_return)
+	mfcr	r6
+	stfd	0,0(r5)			/* save result */
+	mffs	0
+	stfd	0,0(r3)			/* save new fpscr value */
+	stw	r6,0(r4)		/* save new cr value */
+	blr
+SYM_FUNC_END(fpd_return)
+
+/*
+ * Double operation with no input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ */
+#define FPD_NONE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_none;						\
+	mtlr	r12;							\
+									\
+	name.	0;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * Double operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ */
+#define FPD_ONE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_one;						\
+	mtlr	r12;							\
+									\
+	name.	0,0;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_TWO_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_two;						\
+	mtlr	r12;							\
+									\
+	name.	0,0,1;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * CR Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&param1
+ * R6 = (double*)&param2
+ * R7 = (double*)&param3
+ */
+#define FPD_TWO_IN_CR(name)						\
+_GLOBAL(fpd_ ## name);							\
+	lfd	1,0(r6);		/* load param2 */		\
+	lfd	0,0(r5);		/* load param1 */		\
+	lfd	3,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(3);							\
+	lwz	r6, 0(r4);		/* load cr */			\
+	mtcr	r6;							\
+									\
+	name	0,0,1;			/* call instruction */		\
+	mfcr	r6;							\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	stw	r6,0(r4);		/* save new cr value */		\
+	blr
+
+/*
+ * Double operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_THREE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_three;						\
+	mtlr	r12;							\
+									\
+	name.	0,0,1,2;		/* call instruction */		\
+	b	fpd_return
+
+FPD_ONE_IN(fsqrts)
+FPD_ONE_IN(frsqrtes)
+FPD_ONE_IN(fres)
+FPD_ONE_IN(frsp)
+FPD_ONE_IN(fctiw)
+FPD_ONE_IN(fctiwz)
+FPD_ONE_IN(fsqrt)
+FPD_ONE_IN(fre)
+FPD_ONE_IN(frsqrte)
+FPD_ONE_IN(fneg)
+FPD_ONE_IN(fabs)
+FPD_TWO_IN(fadds)
+FPD_TWO_IN(fsubs)
+FPD_TWO_IN(fdivs)
+FPD_TWO_IN(fmuls)
+FPD_TWO_IN_CR(fcmpu)
+FPD_TWO_IN(fcpsgn)
+FPD_TWO_IN(fdiv)
+FPD_TWO_IN(fadd)
+FPD_TWO_IN(fmul)
+FPD_TWO_IN_CR(fcmpo)
+FPD_TWO_IN(fsub)
+FPD_THREE_IN(fmsubs)
+FPD_THREE_IN(fmadds)
+FPD_THREE_IN(fnmsubs)
+FPD_THREE_IN(fnmadds)
+FPD_THREE_IN(fsel)
+FPD_THREE_IN(fmsub)
+FPD_THREE_IN(fmadd)
+FPD_THREE_IN(fnmsub)
+FPD_THREE_IN(fnmadd)
+
+_GLOBAL(kvm_cvt_fd)
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	blr
+
+_GLOBAL(kvm_cvt_df)
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	blr
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 0000000000..23e9c2bd9f
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1852 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ *               2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_ppc.h>
+#include <kvm/iodev.h>
+
+#define MAX_CPU     32
+#define MAX_SRC     256
+#define MAX_TMR     4
+#define MAX_IPI     4
+#define MAX_MSI     8
+#define MAX_IRQ     (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID         0x03	/* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT     (1 << 0)
+#define OPENPIC_FLAG_ILR          (2 << 0)
+
+/* OpenPIC address map */
+#define OPENPIC_REG_SIZE             0x40000
+#define OPENPIC_GLB_REG_START        0x0
+#define OPENPIC_GLB_REG_SIZE         0x10F0
+#define OPENPIC_TMR_REG_START        0x10F0
+#define OPENPIC_TMR_REG_SIZE         0x220
+#define OPENPIC_MSI_REG_START        0x1600
+#define OPENPIC_MSI_REG_SIZE         0x200
+#define OPENPIC_SUMMARY_REG_START    0x3800
+#define OPENPIC_SUMMARY_REG_SIZE     0x800
+#define OPENPIC_SRC_REG_START        0x10000
+#define OPENPIC_SRC_REG_SIZE         (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START        0x20000
+#define OPENPIC_CPU_REG_SIZE         (0x100 + ((MAX_CPU - 1) * 0x1000))
+
+struct fsl_mpic_info {
+	int max_ext;
+};
+
+static struct fsl_mpic_info fsl_mpic_20 = {
+	.max_ext = 12,
+};
+
+static struct fsl_mpic_info fsl_mpic_42 = {
+	.max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT    16
+#define FRR_NCPU_SHIFT     8
+#define FRR_VID_SHIFT      0
+
+#define VID_REVISION_1_2   2
+#define VID_REVISION_1_3   3
+
+#define VIR_GENERIC      0x00000000	/* Generic Vendor ID */
+
+#define GCR_RESET        0x80000000
+#define GCR_MODE_PASS    0x00000000
+#define GCR_MODE_MIXED   0x20000000
+#define GCR_MODE_PROXY   0x60000000
+
+#define TBCR_CI           0x80000000	/* count inhibit */
+#define TCCR_TOG          0x80000000	/* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT      31
+#define IDR_EP_MASK       (1 << IDR_EP_SHIFT)
+#define IDR_CI0_SHIFT     30
+#define IDR_CI1_SHIFT     29
+#define IDR_P1_SHIFT      1
+#define IDR_P0_SHIFT      0
+
+#define ILR_INTTGT_MASK   0x000000ff
+#define ILR_INTTGT_INT    0x00
+#define ILR_INTTGT_CINT   0x01	/* critical */
+#define ILR_INTTGT_MCP    0x02	/* machine check */
+#define NUM_OUTPUTS       3
+
+#define MSIIR_OFFSET       0x140
+#define MSIIR_SRS_SHIFT    29
+#define MSIIR_SRS_MASK     (0x7 << MSIIR_SRS_SHIFT)
+#define MSIIR_IBS_SHIFT    24
+#define MSIIR_IBS_MASK     (0x1f << MSIIR_IBS_SHIFT)
+
+static int get_current_cpu(void)
+{
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+	return vcpu ? vcpu->arch.irq_cpu_id : -1;
+#else
+	/* XXX */
+	return -1;
+#endif
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx);
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val);
+
+enum irq_type {
+	IRQ_TYPE_NORMAL = 0,
+	IRQ_TYPE_FSLINT,	/* FSL internal interrupt -- level only */
+	IRQ_TYPE_FSLSPECIAL,	/* FSL timer/IPI interrupt, edge, no polarity */
+};
+
+struct irq_queue {
+	/* Round up to the nearest 64 IRQs so that the queue length
+	 * won't change when moving between 32 and 64 bit hosts.
+	 */
+	unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+	int next;
+	int priority;
+};
+
+struct irq_source {
+	uint32_t ivpr;		/* IRQ vector/priority register */
+	uint32_t idr;		/* IRQ destination register */
+	uint32_t destmask;	/* bitmap of CPU destinations */
+	int last_cpu;
+	int output;		/* IRQ level, e.g. ILR_INTTGT_INT */
+	int pending;		/* TRUE if IRQ is pending */
+	enum irq_type type;
+	bool level:1;		/* level-triggered */
+	bool nomask:1;	/* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT       31
+#define IVPR_MASK_MASK        (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT   30
+#define IVPR_ACTIVITY_MASK    (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT       29
+#define IVPR_MODE_MASK        (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT   23
+#define IVPR_POLARITY_MASK    (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT      22
+#define IVPR_SENSE_MASK       (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK     (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP      0x80000000	/* external pin */
+#define IDR_CI      0x40000000	/* critical interrupt */
+
+struct irq_dest {
+	struct kvm_vcpu *vcpu;
+
+	int32_t ctpr;		/* CPU current task priority */
+	struct irq_queue raised;
+	struct irq_queue servicing;
+
+	/* Count of IRQ sources asserting on non-INT outputs */
+	uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+#define MAX_MMIO_REGIONS 10
+
+struct openpic {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct kvm_io_device mmio;
+	const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
+	int num_mmio_regions;
+
+	gpa_t reg_base;
+	spinlock_t lock;
+
+	/* Behavior control */
+	struct fsl_mpic_info *fsl;
+	uint32_t model;
+	uint32_t flags;
+	uint32_t nb_irqs;
+	uint32_t vid;
+	uint32_t vir;		/* Vendor identification register */
+	uint32_t vector_mask;
+	uint32_t tfrr_reset;
+	uint32_t ivpr_reset;
+	uint32_t idr_reset;
+	uint32_t brr1;
+	uint32_t mpic_mode_mask;
+
+	/* Global registers */
+	uint32_t frr;		/* Feature reporting register */
+	uint32_t gcr;		/* Global configuration register  */
+	uint32_t pir;		/* Processor initialization register */
+	uint32_t spve;		/* Spurious vector register */
+	uint32_t tfrr;		/* Timer frequency reporting register */
+	/* Source registers */
+	struct irq_source src[MAX_IRQ];
+	/* Local registers per output pin */
+	struct irq_dest dst[MAX_CPU];
+	uint32_t nb_cpus;
+	/* Timer registers */
+	struct {
+		uint32_t tccr;	/* Global timer current count register */
+		uint32_t tbcr;	/* Global timer base count register */
+	} timers[MAX_TMR];
+	/* Shared MSI registers */
+	struct {
+		uint32_t msir;	/* Shared Message Signaled Interrupt Register */
+	} msi[MAX_MSI];
+	uint32_t max_irq;
+	uint32_t irq_ipi0;
+	uint32_t irq_tim0;
+	uint32_t irq_msi;
+};
+
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	struct kvm_interrupt irq = {
+		.irq = KVM_INTERRUPT_SET_LEVEL,
+	};
+
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvmppc_core_dequeue_external(dst->vcpu);
+}
+
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+	set_bit(n_IRQ, q->queue);
+}
+
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+	clear_bit(n_IRQ, q->queue);
+}
+
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+	int irq = -1;
+	int next = -1;
+	int priority = -1;
+
+	for (;;) {
+		irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+		if (irq == opp->max_irq)
+			break;
+
+		pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+			irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+		if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+			next = irq;
+			priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+		}
+	}
+
+	q->next = next;
+	q->priority = priority;
+}
+
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+	/* XXX: optimize */
+	IRQ_check(opp, q);
+
+	return q->next;
+}
+
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+			   bool active, bool was_active)
+{
+	struct irq_dest *dst;
+	struct irq_source *src;
+	int priority;
+
+	dst = &opp->dst[n_CPU];
+	src = &opp->src[n_IRQ];
+
+	pr_debug("%s: IRQ %d active %d was %d\n",
+		__func__, n_IRQ, active, was_active);
+
+	if (src->output != ILR_INTTGT_INT) {
+		pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+			__func__, src->output, n_IRQ, active, was_active,
+			dst->outputs_active[src->output]);
+
+		/* On Freescale MPIC, critical interrupts ignore priority,
+		 * IACK, EOI, etc.  Before MPIC v4.1 they also ignore
+		 * masking.
+		 */
+		if (active) {
+			if (!was_active &&
+			    dst->outputs_active[src->output]++ == 0) {
+				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_raise(opp, dst, src->output);
+			}
+		} else {
+			if (was_active &&
+			    --dst->outputs_active[src->output] == 0) {
+				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_lower(opp, dst, src->output);
+			}
+		}
+
+		return;
+	}
+
+	priority = IVPR_PRIORITY(src->ivpr);
+
+	/* Even if the interrupt doesn't have enough priority,
+	 * it is still raised, in case ctpr is lowered later.
+	 */
+	if (active)
+		IRQ_setbit(&dst->raised, n_IRQ);
+	else
+		IRQ_resetbit(&dst->raised, n_IRQ);
+
+	IRQ_check(opp, &dst->raised);
+
+	if (active && priority <= dst->ctpr) {
+		pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+			__func__, n_IRQ, priority, dst->ctpr, n_CPU);
+		active = 0;
+	}
+
+	if (active) {
+		if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+		    priority <= dst->servicing.priority) {
+			pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+				__func__, n_IRQ, dst->servicing.next, n_CPU);
+		} else {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+				__func__, n_CPU, n_IRQ, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+	} else {
+		IRQ_get_next(opp, &dst->servicing);
+		if (dst->raised.priority > dst->ctpr &&
+		    dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->raised.next,
+				dst->raised.priority, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			/* IRQ line stays asserted */
+		} else {
+			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		}
+	}
+}
+
+/* update pic state because registers for n_IRQ have changed value */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+	struct irq_source *src;
+	bool active, was_active;
+	int i;
+
+	src = &opp->src[n_IRQ];
+	active = src->pending;
+
+	if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+		/* Interrupt source is disabled */
+		pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+		active = false;
+	}
+
+	was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+	/*
+	 * We don't have a similar check for already-active because
+	 * ctpr may have changed and we need to withdraw the interrupt.
+	 */
+	if (!active && !was_active) {
+		pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (active)
+		src->ivpr |= IVPR_ACTIVITY_MASK;
+	else
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+	if (src->destmask == 0) {
+		/* No target */
+		pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (src->destmask == (1 << src->last_cpu)) {
+		/* Only one CPU is allowed to receive this IRQ */
+		IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+	} else if (!(src->ivpr & IVPR_MODE_MASK)) {
+		/* Directed delivery mode */
+		for (i = 0; i < opp->nb_cpus; i++) {
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+			}
+		}
+	} else {
+		/* Distributed delivery mode */
+		for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+			if (i == opp->nb_cpus)
+				i = 0;
+
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+				src->last_cpu = i;
+				break;
+			}
+		}
+	}
+}
+
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+
+	if (n_IRQ >= MAX_IRQ) {
+		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+		return;
+	}
+
+	src = &opp->src[n_IRQ];
+	pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+		n_IRQ, level, src->ivpr);
+	if (src->level) {
+		/* level-sensitive irq */
+		src->pending = level;
+		openpic_update_irq(opp, n_IRQ);
+	} else {
+		/* edge-sensitive irq */
+		if (level) {
+			src->pending = 1;
+			openpic_update_irq(opp, n_IRQ);
+		}
+
+		if (src->output != ILR_INTTGT_INT) {
+			/* Edge-triggered interrupts shouldn't be used
+			 * with non-INT delivery, but just in case,
+			 * try to make it do something sane rather than
+			 * cause an interrupt storm.  This is close to
+			 * what you'd probably see happen in real hardware.
+			 */
+			src->pending = 0;
+			openpic_update_irq(opp, n_IRQ);
+		}
+	}
+}
+
+static void openpic_reset(struct openpic *opp)
+{
+	int i;
+
+	opp->gcr = GCR_RESET;
+	/* Initialise controller registers */
+	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+	    (opp->vid << FRR_VID_SHIFT);
+
+	opp->pir = 0;
+	opp->spve = -1 & opp->vector_mask;
+	opp->tfrr = opp->tfrr_reset;
+	/* Initialise IRQ sources */
+	for (i = 0; i < opp->max_irq; i++) {
+		opp->src[i].ivpr = opp->ivpr_reset;
+
+		switch (opp->src[i].type) {
+		case IRQ_TYPE_NORMAL:
+			opp->src[i].level =
+			    !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+			break;
+
+		case IRQ_TYPE_FSLINT:
+			opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+			break;
+
+		case IRQ_TYPE_FSLSPECIAL:
+			break;
+		}
+
+		write_IRQreg_idr(opp, i, opp->idr_reset);
+	}
+	/* Initialise IRQ destinations */
+	for (i = 0; i < MAX_CPU; i++) {
+		opp->dst[i].ctpr = 15;
+		memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+		opp->dst[i].raised.next = -1;
+		memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+		opp->dst[i].servicing.next = -1;
+	}
+	/* Initialise timers */
+	for (i = 0; i < MAX_TMR; i++) {
+		opp->timers[i].tccr = 0;
+		opp->timers[i].tbcr = TBCR_CI;
+	}
+	/* Go out of RESET state */
+	opp->gcr = 0;
+}
+
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].idr;
+}
+
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR)
+		return opp->src[n_IRQ].output;
+
+	return 0xffffffff;
+}
+
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].ivpr;
+}
+
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	struct irq_source *src = &opp->src[n_IRQ];
+	uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+	uint32_t crit_mask = 0;
+	uint32_t mask = normal_mask;
+	int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+	int i;
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		crit_mask = mask << crit_shift;
+		mask |= crit_mask | IDR_EP;
+	}
+
+	src->idr = val & mask;
+	pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		if (src->idr & crit_mask) {
+			if (src->idr & normal_mask) {
+				pr_debug("%s: IRQ configured for multiple output types, using critical\n",
+					__func__);
+			}
+
+			src->output = ILR_INTTGT_CINT;
+			src->nomask = true;
+			src->destmask = 0;
+
+			for (i = 0; i < opp->nb_cpus; i++) {
+				int n_ci = IDR_CI0_SHIFT - i;
+
+				if (src->idr & (1UL << n_ci))
+					src->destmask |= 1UL << i;
+			}
+		} else {
+			src->output = ILR_INTTGT_INT;
+			src->nomask = false;
+			src->destmask = src->idr & normal_mask;
+		}
+	} else {
+		src->destmask = src->idr;
+	}
+}
+
+static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR) {
+		struct irq_source *src = &opp->src[n_IRQ];
+
+		src->output = val & ILR_INTTGT_MASK;
+		pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
+			src->output);
+
+		/* TODO: on MPIC v4.0 only, set nomask for non-INT */
+	}
+}
+
+static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
+				     uint32_t val)
+{
+	uint32_t mask;
+
+	/* NOTE when implementing newer FSL MPIC models: starting with v4.0,
+	 * the polarity bit is read-only on internal interrupts.
+	 */
+	mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
+	    IVPR_POLARITY_MASK | opp->vector_mask;
+
+	/* ACTIVITY bit is read-only */
+	opp->src[n_IRQ].ivpr =
+	    (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
+
+	/* For FSL internal interrupts, The sense bit is reserved and zero,
+	 * and the interrupt is always level-triggered.  Timers and IPIs
+	 * have no sense or polarity bits, and are edge-triggered.
+	 */
+	switch (opp->src[n_IRQ].type) {
+	case IRQ_TYPE_NORMAL:
+		opp->src[n_IRQ].level =
+		    !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+		break;
+
+	case IRQ_TYPE_FSLINT:
+		opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+		break;
+
+	case IRQ_TYPE_FSLSPECIAL:
+		opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+		break;
+	}
+
+	openpic_update_irq(opp, n_IRQ);
+	pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+		opp->src[n_IRQ].ivpr);
+}
+
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+	if (val & GCR_RESET) {
+		openpic_reset(opp);
+		return;
+	}
+
+	opp->gcr &= ~opp->mpic_mode_mask;
+	opp->gcr |= val & opp->mpic_mode_mask;
+}
+
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case 0x00:	/* Block Revision Register1 (BRR1) is Readonly */
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_write_internal(opp, addr, val,
+						 get_current_cpu());
+		break;
+	case 0x1000:		/* FRR */
+		break;
+	case 0x1020:		/* GCR */
+		openpic_gcr_write(opp, val);
+		break;
+	case 0x1080:		/* VIR */
+		break;
+	case 0x1090:		/* PIR */
+		/*
+		 * This register is used to reset a CPU core --
+		 * let userspace handle it.
+		 */
+		err = -ENXIO;
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0: {
+		int idx;
+		idx = (addr - 0x10A0) >> 4;
+		write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+		break;
+	}
+	case 0x10E0:		/* SPVE */
+		opp->spve = val & opp->vector_mask;
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	u32 retval;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+	if (addr & 0xF)
+		goto out;
+
+	switch (addr) {
+	case 0x1000:		/* FRR */
+		retval = opp->frr;
+		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
+		break;
+	case 0x1020:		/* GCR */
+		retval = opp->gcr;
+		break;
+	case 0x1080:		/* VIR */
+		retval = opp->vir;
+		break;
+	case 0x1090:		/* PIR */
+		retval = 0x00000000;
+		break;
+	case 0x00:		/* Block Revision Register1 (BRR1) */
+		retval = opp->brr1;
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_read_internal(opp, addr,
+			&retval, get_current_cpu());
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0:
+		{
+			int idx;
+			idx = (addr - 0x10A0) >> 4;
+			retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+		}
+		break;
+	case 0x10E0:		/* SPVE */
+		retval = opp->spve;
+		break;
+	default:
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return err;
+}
+
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	addr += 0x10f0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	if (addr == 0x10f0) {
+		/* TFRR */
+		opp->tfrr = val;
+		return 0;
+	}
+
+	idx = (addr >> 6) & 0x3;
+	addr = addr & 0x30;
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		break;
+	case 0x10:		/* TBCR */
+		if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+		    (val & TBCR_CI) == 0 &&
+		    (opp->timers[idx].tbcr & TBCR_CI) != 0)
+			opp->timers[idx].tccr &= ~TCCR_TOG;
+
+		opp->timers[idx].tbcr = val;
+		break;
+	case 0x20:		/* TVPR */
+		write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+		break;
+	case 0x30:		/* TDR */
+		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval = -1;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		goto out;
+
+	idx = (addr >> 6) & 0x3;
+	if (addr == 0x0) {
+		/* TFRR */
+		retval = opp->tfrr;
+		goto out;
+	}
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		retval = opp->timers[idx].tccr;
+		break;
+	case 0x10:		/* TBCR */
+		retval = opp->timers[idx].tbcr;
+		break;
+	case 0x20:		/* TIPV */
+		retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+		break;
+	case 0x30:		/* TIDE (TIDR) */
+		retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		write_IRQreg_ivpr(opp, idx, val);
+		break;
+	case 0x10:
+		write_IRQreg_idr(opp, idx, val);
+		break;
+	case 0x18:
+		write_IRQreg_ilr(opp, idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		retval = read_IRQreg_ivpr(opp, idx);
+		break;
+	case 0x10:
+		retval = read_IRQreg_idr(opp, idx);
+		break;
+	case 0x18:
+		retval = read_IRQreg_ilr(opp, idx);
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx = opp->irq_msi;
+	int srs, ibs;
+
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case MSIIR_OFFSET:
+		srs = val >> MSIIR_SRS_SHIFT;
+		idx += srs;
+		ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+		opp->msi[srs].msir |= 1 << ibs;
+		openpic_set_irq(opp, idx, 1);
+		break;
+	default:
+		/* most registers are read-only, thus ignored */
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t r = 0;
+	int i, srs;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		return -ENXIO;
+
+	srs = addr >> 4;
+
+	switch (addr) {
+	case 0x00:
+	case 0x10:
+	case 0x20:
+	case 0x30:
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:		/* MSIRs */
+		r = opp->msi[srs].msir;
+		/* Clear on read */
+		opp->msi[srs].msir = 0;
+		openpic_set_irq(opp, opp->irq_msi + srs, 0);
+		break;
+	case 0x120:		/* MSISR */
+		for (i = 0; i < MAX_MSI; i++)
+			r |= (opp->msi[i].msir ? 1 : 0) << i;
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, r);
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	uint32_t r = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+
+	/* TODO: EISR/EIMR */
+
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+	/* TODO: EISR/EIMR */
+	return 0;
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+	struct irq_dest *dst;
+	int s_IRQ, n_IRQ;
+
+	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+		addr, val);
+
+	if (idx < 0)
+		return 0;
+
+	if (addr & 0xF)
+		return 0;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x40:		/* IPIDR */
+	case 0x50:
+	case 0x60:
+	case 0x70:
+		idx = (addr - 0x40) >> 4;
+		/* we use IDE as mask which CPUs to deliver the IPI to still. */
+		opp->src[opp->irq_ipi0 + idx].destmask |= val;
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
+		break;
+	case 0x80:		/* CTPR */
+		dst->ctpr = val & 0x0000000F;
+
+		pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+			__func__, idx, dst->ctpr, dst->raised.priority,
+			dst->servicing.priority);
+
+		if (dst->raised.priority <= dst->ctpr) {
+			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+				__func__, idx);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		} else if (dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+				__func__, idx, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		break;
+	case 0x90:		/* WHOAMI */
+		/* Read-only register */
+		break;
+	case 0xA0:		/* IACK */
+		/* Read-only register */
+		break;
+	case 0xB0: {		/* EOI */
+		int notify_eoi;
+
+		pr_debug("EOI\n");
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+
+		if (s_IRQ < 0) {
+			pr_debug("%s: EOI with no interrupt in service\n",
+				__func__);
+			break;
+		}
+
+		IRQ_resetbit(&dst->servicing, s_IRQ);
+		/* Notify listeners that the IRQ is over */
+		notify_eoi = s_IRQ;
+		/* Set up next servicing IRQ */
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+		/* Check queued interrupts. */
+		n_IRQ = IRQ_get_next(opp, &dst->raised);
+		src = &opp->src[n_IRQ];
+		if (n_IRQ != -1 &&
+		    (s_IRQ == -1 ||
+		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+				idx, n_IRQ);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		spin_unlock(&opp->lock);
+		kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+		spin_lock(&opp->lock);
+
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_write_internal(opp, addr, val,
+					 (addr & 0x1f000) >> 12);
+}
+
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+			     int cpu)
+{
+	struct irq_source *src;
+	int retval, irq;
+
+	pr_debug("Lower OpenPIC INT output\n");
+	mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+	irq = IRQ_get_next(opp, &dst->raised);
+	pr_debug("IACK: irq=%d\n", irq);
+
+	if (irq == -1)
+		/* No more interrupt pending */
+		return opp->spve;
+
+	src = &opp->src[irq];
+	if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+	    !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+		pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+			__func__, irq, dst->ctpr, src->ivpr);
+		openpic_update_irq(opp, irq);
+		retval = opp->spve;
+	} else {
+		/* IRQ enter servicing state */
+		IRQ_setbit(&dst->servicing, irq);
+		retval = IVPR_VECTOR(opp, src->ivpr);
+	}
+
+	if (!src->level) {
+		/* edge-sensitive IRQ */
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+		src->pending = 0;
+		IRQ_resetbit(&dst->raised, irq);
+	}
+
+	if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+		src->destmask &= ~(1 << cpu);
+		if (src->destmask && !src->level) {
+			/* trigger on CPUs that didn't know about it yet */
+			openpic_set_irq(opp, irq, 1);
+			openpic_set_irq(opp, irq, 0);
+			/* if all CPUs knew about it, set active bit again */
+			src->ivpr |= IVPR_ACTIVITY_MASK;
+		}
+	}
+
+	return retval;
+}
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+	struct openpic *opp = vcpu->arch.mpic;
+	int cpu = vcpu->arch.irq_cpu_id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+		kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+	spin_unlock_irqrestore(&opp->lock, flags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_dest *dst;
+	uint32_t retval;
+
+	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+	retval = 0xFFFFFFFF;
+
+	if (idx < 0)
+		goto out;
+
+	if (addr & 0xF)
+		goto out;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x80:		/* CTPR */
+		retval = dst->ctpr;
+		break;
+	case 0x90:		/* WHOAMI */
+		retval = idx;
+		break;
+	case 0xA0:		/* IACK */
+		retval = openpic_iack(opp, dst, idx);
+		break;
+	case 0xB0:		/* EOI */
+		retval = 0;
+		break;
+	default:
+		break;
+	}
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_read_internal(opp, addr, ptr,
+					 (addr & 0x1f000) >> 12);
+}
+
+struct mem_reg {
+	int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+	int (*write)(void *opaque, gpa_t addr, u32 val);
+	gpa_t start_addr;
+	int size;
+};
+
+static const struct mem_reg openpic_gbl_mmio = {
+	.write = openpic_gbl_write,
+	.read = openpic_gbl_read,
+	.start_addr = OPENPIC_GLB_REG_START,
+	.size = OPENPIC_GLB_REG_SIZE,
+};
+
+static const struct mem_reg openpic_tmr_mmio = {
+	.write = openpic_tmr_write,
+	.read = openpic_tmr_read,
+	.start_addr = OPENPIC_TMR_REG_START,
+	.size = OPENPIC_TMR_REG_SIZE,
+};
+
+static const struct mem_reg openpic_cpu_mmio = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.start_addr = OPENPIC_CPU_REG_START,
+	.size = OPENPIC_CPU_REG_SIZE,
+};
+
+static const struct mem_reg openpic_src_mmio = {
+	.write = openpic_src_write,
+	.read = openpic_src_read,
+	.start_addr = OPENPIC_SRC_REG_START,
+	.size = OPENPIC_SRC_REG_SIZE,
+};
+
+static const struct mem_reg openpic_msi_mmio = {
+	.read = openpic_msi_read,
+	.write = openpic_msi_write,
+	.start_addr = OPENPIC_MSI_REG_START,
+	.size = OPENPIC_MSI_REG_SIZE,
+};
+
+static const struct mem_reg openpic_summary_mmio = {
+	.read = openpic_summary_read,
+	.write = openpic_summary_write,
+	.start_addr = OPENPIC_SUMMARY_REG_START,
+	.size = OPENPIC_SUMMARY_REG_SIZE,
+};
+
+static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
+{
+	if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
+		WARN(1, "kvm mpic: too many mmio regions\n");
+		return;
+	}
+
+	opp->mmio_regions[opp->num_mmio_regions++] = mr;
+}
+
+static void fsl_common_init(struct openpic *opp)
+{
+	int i;
+	int virq = MAX_SRC;
+
+	add_mmio_region(opp, &openpic_msi_mmio);
+	add_mmio_region(opp, &openpic_summary_mmio);
+
+	opp->vid = VID_REVISION_1_2;
+	opp->vir = VIR_GENERIC;
+	opp->vector_mask = 0xFFFF;
+	opp->tfrr_reset = 0;
+	opp->ivpr_reset = IVPR_MASK_MASK;
+	opp->idr_reset = 1 << 0;
+	opp->max_irq = MAX_IRQ;
+
+	opp->irq_ipi0 = virq;
+	virq += MAX_IPI;
+	opp->irq_tim0 = virq;
+	virq += MAX_TMR;
+
+	BUG_ON(virq > MAX_IRQ);
+
+	opp->irq_msi = 224;
+
+	for (i = 0; i < opp->fsl->max_ext; i++)
+		opp->src[i].level = false;
+
+	/* Internal interrupts, including message and MSI */
+	for (i = 16; i < MAX_SRC; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLINT;
+		opp->src[i].level = true;
+	}
+
+	/* timers and IPIs */
+	for (i = MAX_SRC; i < virq; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+		opp->src[i].level = false;
+	}
+}
+
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->read(opp, addr - mr->start_addr, ptr);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->write(opp, addr - mr->start_addr, val);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_read(struct kvm_vcpu *vcpu,
+			 struct kvm_io_device *this,
+			 gpa_t addr, int len, void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+	union {
+		u32 val;
+		u8 bytes[4];
+	} u;
+
+	if (addr & (len - 1)) {
+		pr_debug("%s: bad alignment %llx/%d\n",
+			 __func__, addr, len);
+		return -EINVAL;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+	spin_unlock_irq(&opp->lock);
+
+	/*
+	 * Technically only 32-bit accesses are allowed, but be nice to
+	 * people dumping registers a byte at a time -- it works in real
+	 * hardware (reads only, not writes).
+	 */
+	if (len == 4) {
+		*(u32 *)ptr = u.val;
+		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+			 __func__, addr, ret, u.val);
+	} else if (len == 1) {
+		*(u8 *)ptr = u.bytes[addr & 3];
+		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+			 __func__, addr, ret, u.bytes[addr & 3]);
+	} else {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int kvm_mpic_write(struct kvm_vcpu *vcpu,
+			  struct kvm_io_device *this,
+			  gpa_t addr, int len, const void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+
+	if (len != 4) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EOPNOTSUPP;
+	}
+	if (addr & 3) {
+		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+				      *(const u32 *)ptr);
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: addr %llx ret %d val %x\n",
+		 __func__, addr, ret, *(const u32 *)ptr);
+
+	return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+	.read = kvm_mpic_read,
+	.write = kvm_mpic_write,
+};
+
+static void map_mmio(struct openpic *opp)
+{
+	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+				opp->reg_base, OPENPIC_REG_SIZE,
+				&opp->mmio);
+}
+
+static void unmap_mmio(struct openpic *opp)
+{
+	kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 base;
+
+	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+		return -EFAULT;
+
+	if (base & 0x3ffff) {
+		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+			 __func__, base);
+		return -EINVAL;
+	}
+
+	if (base == opp->reg_base)
+		return 0;
+
+	mutex_lock(&opp->kvm->slots_lock);
+
+	unmap_mmio(opp);
+	opp->reg_base = base;
+
+	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+		 __func__, base);
+
+	if (base == 0)
+		goto out;
+
+	map_mmio(opp);
+
+out:
+	mutex_unlock(&opp->kvm->slots_lock);
+	return 0;
+}
+
+#define ATTR_SET		0
+#define ATTR_GET		1
+
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+	int ret;
+
+	if (addr & 3)
+		return -ENXIO;
+
+	spin_lock_irq(&opp->lock);
+
+	if (type == ATTR_SET)
+		ret = kvm_mpic_write_internal(opp, addr, *val);
+	else
+		ret = kvm_mpic_read_internal(opp, addr, val);
+
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+	return ret;
+}
+
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u32 attr32;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return set_base_addr(opp, attr);
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return access_reg(opp, attr->attr, &attr32, ATTR_SET);
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		if (attr32 != 0 && attr32 != 1)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		openpic_set_irq(opp, attr->attr, attr32);
+		spin_unlock_irq(&opp->lock);
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u64 attr64;
+	u32 attr32;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			mutex_lock(&opp->kvm->slots_lock);
+			attr64 = opp->reg_base;
+			mutex_unlock(&opp->kvm->slots_lock);
+
+			if (copy_to_user((u64 __user *)(long)attr->addr,
+					 &attr64, sizeof(u64)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+		if (ret)
+			return ret;
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		attr32 = opp->src[attr->attr].pending;
+		spin_unlock_irq(&opp->lock);
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			break;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static void mpic_destroy(struct kvm_device *dev)
+{
+	struct openpic *opp = dev->private;
+
+	dev->kvm->arch.mpic = NULL;
+	kfree(opp);
+	kfree(dev);
+}
+
+static int mpic_set_default_irq_routing(struct openpic *opp)
+{
+	struct kvm_irq_routing_entry *routing;
+
+	/* Create a nop default map, so that dereferencing it still works */
+	routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
+	if (!routing)
+		return -ENOMEM;
+
+	kvm_set_irq_routing(opp->kvm, routing, 0, 0);
+
+	kfree(routing);
+	return 0;
+}
+
+static int mpic_create(struct kvm_device *dev, u32 type)
+{
+	struct openpic *opp;
+	int ret;
+
+	/* We only support one MPIC at a time for now */
+	if (dev->kvm->arch.mpic)
+		return -EINVAL;
+
+	opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+	if (!opp)
+		return -ENOMEM;
+
+	dev->private = opp;
+	opp->kvm = dev->kvm;
+	opp->dev = dev;
+	opp->model = type;
+	spin_lock_init(&opp->lock);
+
+	add_mmio_region(opp, &openpic_gbl_mmio);
+	add_mmio_region(opp, &openpic_tmr_mmio);
+	add_mmio_region(opp, &openpic_src_mmio);
+	add_mmio_region(opp, &openpic_cpu_mmio);
+
+	switch (opp->model) {
+	case KVM_DEV_TYPE_FSL_MPIC_20:
+		opp->fsl = &fsl_mpic_20;
+		opp->brr1 = 0x00400200;
+		opp->flags |= OPENPIC_FLAG_IDR_CRIT;
+		opp->nb_irqs = 80;
+		opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+		fsl_common_init(opp);
+
+		break;
+
+	case KVM_DEV_TYPE_FSL_MPIC_42:
+		opp->fsl = &fsl_mpic_42;
+		opp->brr1 = 0x00400402;
+		opp->flags |= OPENPIC_FLAG_ILR;
+		opp->nb_irqs = 196;
+		opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+		fsl_common_init(opp);
+
+		break;
+
+	default:
+		ret = -ENODEV;
+		goto err;
+	}
+
+	ret = mpic_set_default_irq_routing(opp);
+	if (ret)
+		goto err;
+
+	openpic_reset(opp);
+
+	smp_wmb();
+	dev->kvm->arch.mpic = opp;
+
+	return 0;
+
+err:
+	kfree(opp);
+	return ret;
+}
+
+struct kvm_device_ops kvm_mpic_ops = {
+	.name = "kvm-mpic",
+	.create = mpic_create,
+	.destroy = mpic_destroy,
+	.set_attr = mpic_set_attr,
+	.get_attr = mpic_get_attr,
+	.has_attr = mpic_has_attr,
+};
+
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 cpu)
+{
+	struct openpic *opp = dev->private;
+	int ret = 0;
+
+	if (dev->ops != &kvm_mpic_ops)
+		return -EPERM;
+	if (opp->kvm != vcpu->kvm)
+		return -EPERM;
+	if (cpu < 0 || cpu >= MAX_CPU)
+		return -EPERM;
+
+	spin_lock_irq(&opp->lock);
+
+	if (opp->dst[cpu].vcpu) {
+		ret = -EEXIST;
+		goto out;
+	}
+	if (vcpu->arch.irq_type) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	opp->dst[cpu].vcpu = vcpu;
+	opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
+
+	vcpu->arch.mpic = opp;
+	vcpu->arch.irq_cpu_id = cpu;
+	vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
+
+	/* This might need to be changed if GCR gets extended */
+	if (opp->mpic_mode_mask == GCR_MODE_PROXY)
+		vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
+
+out:
+	spin_unlock_irq(&opp->lock);
+	return ret;
+}
+
+/*
+ * This should only happen immediately before the mpic is destroyed,
+ * so we shouldn't need to worry about anything still trying to
+ * access the vcpu pointer.
+ */
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
+{
+	BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
+
+	opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
+}
+
+/*
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
+			struct kvm *kvm, int irq_source_id, int level,
+			bool line_status)
+{
+	u32 irq = e->irqchip.pin;
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+	openpic_set_irq(opp, irq, level);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	/*
+	 * XXX We ignore the target address for now, as we only support
+	 *     a single MSI bank.
+	 */
+	openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
+			  const struct kvm_irq_routing_entry *ue)
+{
+	int r = -EINVAL;
+
+	switch (ue->type) {
+	case KVM_IRQ_ROUTING_IRQCHIP:
+		e->set = mpic_set_irq;
+		e->irqchip.irqchip = ue->u.irqchip.irqchip;
+		e->irqchip.pin = ue->u.irqchip.pin;
+		if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
+			goto out;
+		break;
+	case KVM_IRQ_ROUTING_MSI:
+		e->set = kvm_set_msi;
+		e->msi.address_lo = ue->u.msi.address_lo;
+		e->msi.address_hi = ue->u.msi.address_hi;
+		e->msi.data = ue->u.msi.data;
+		break;
+	default:
+		goto out;
+	}
+
+	r = 0;
+out:
+	return r;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
new file mode 100644
index 0000000000..7197c82566
--- /dev/null
+++ b/arch/powerpc/kvm/powerpc.c
@@ -0,0 +1,2552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/vmalloc.h>
+#include <linux/hrtimer.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/of.h>
+#include <asm/cputable.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/cputhreads.h>
+#include <asm/irqflags.h>
+#include <asm/iommu.h>
+#include <asm/switch_to.h>
+#include <asm/xive.h>
+#ifdef CONFIG_PPC_PSERIES
+#include <asm/hvcall.h>
+#include <asm/plpar_wrappers.h>
+#endif
+#include <asm/ultravisor.h>
+#include <asm/setup.h>
+
+#include "timing.h"
+#include "../mm/mmu_decl.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+struct kvmppc_ops *kvmppc_hv_ops;
+EXPORT_SYMBOL_GPL(kvmppc_hv_ops);
+struct kvmppc_ops *kvmppc_pr_ops;
+EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
+
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+	return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_runnable(vcpu);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return 1;
+}
+
+/*
+ * Common checks before entering the guest world.  Call with interrupts
+ * disabled.
+ *
+ * returns:
+ *
+ * == 1 if we're ready to go into guest state
+ * <= 0 if we need to go back to the host with return value
+ */
+int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	WARN_ON(irqs_disabled());
+	hard_irq_disable();
+
+	while (true) {
+		if (need_resched()) {
+			local_irq_enable();
+			cond_resched();
+			hard_irq_disable();
+			continue;
+		}
+
+		if (signal_pending(current)) {
+			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+			break;
+		}
+
+		vcpu->mode = IN_GUEST_MODE;
+
+		/*
+		 * Reading vcpu->requests must happen after setting vcpu->mode,
+		 * so we don't miss a request because the requester sees
+		 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
+		 * before next entering the guest (and thus doesn't IPI).
+		 * This also orders the write to mode from any reads
+		 * to the page tables done while the VCPU is running.
+		 * Please see the comment in kvm_flush_remote_tlbs.
+		 */
+		smp_mb();
+
+		if (kvm_request_pending(vcpu)) {
+			/* Make sure we process requests preemptable */
+			local_irq_enable();
+			trace_kvm_check_requests(vcpu);
+			r = kvmppc_core_check_requests(vcpu);
+			hard_irq_disable();
+			if (r > 0)
+				continue;
+			break;
+		}
+
+		if (kvmppc_core_prepare_to_enter(vcpu)) {
+			/* interrupts got enabled in between, so we
+			   are back at square 1 */
+			continue;
+		}
+
+		guest_enter_irqoff();
+		return 1;
+	}
+
+	/* return to host */
+	local_irq_enable();
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter);
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+static void kvmppc_swab_shared(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
+	int i;
+
+	shared->sprg0 = swab64(shared->sprg0);
+	shared->sprg1 = swab64(shared->sprg1);
+	shared->sprg2 = swab64(shared->sprg2);
+	shared->sprg3 = swab64(shared->sprg3);
+	shared->srr0 = swab64(shared->srr0);
+	shared->srr1 = swab64(shared->srr1);
+	shared->dar = swab64(shared->dar);
+	shared->msr = swab64(shared->msr);
+	shared->dsisr = swab32(shared->dsisr);
+	shared->int_pending = swab32(shared->int_pending);
+	for (i = 0; i < ARRAY_SIZE(shared->sr); i++)
+		shared->sr[i] = swab32(shared->sr[i]);
+}
+#endif
+
+int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
+{
+	int nr = kvmppc_get_gpr(vcpu, 11);
+	int r;
+	unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3);
+	unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4);
+	unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5);
+	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
+	unsigned long r2 = 0;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
+		/* 32 bit mode */
+		param1 &= 0xffffffff;
+		param2 &= 0xffffffff;
+		param3 &= 0xffffffff;
+		param4 &= 0xffffffff;
+	}
+
+	switch (nr) {
+	case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
+	{
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+		/* Book3S can be little endian, find it out here */
+		int shared_big_endian = true;
+		if (vcpu->arch.intr_msr & MSR_LE)
+			shared_big_endian = false;
+		if (shared_big_endian != vcpu->arch.shared_big_endian)
+			kvmppc_swab_shared(vcpu);
+		vcpu->arch.shared_big_endian = shared_big_endian;
+#endif
+
+		if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) {
+			/*
+			 * Older versions of the Linux magic page code had
+			 * a bug where they would map their trampoline code
+			 * NX. If that's the case, remove !PR NX capability.
+			 */
+			vcpu->arch.disable_kernel_nx = true;
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		}
+
+		vcpu->arch.magic_page_pa = param1 & ~0xfffULL;
+		vcpu->arch.magic_page_ea = param2 & ~0xfffULL;
+
+#ifdef CONFIG_PPC_64K_PAGES
+		/*
+		 * Make sure our 4k magic page is in the same window of a 64k
+		 * page within the guest and within the host's page.
+		 */
+		if ((vcpu->arch.magic_page_pa & 0xf000) !=
+		    ((ulong)vcpu->arch.shared & 0xf000)) {
+			void *old_shared = vcpu->arch.shared;
+			ulong shared = (ulong)vcpu->arch.shared;
+			void *new_shared;
+
+			shared &= PAGE_MASK;
+			shared |= vcpu->arch.magic_page_pa & 0xf000;
+			new_shared = (void*)shared;
+			memcpy(new_shared, old_shared, 0x1000);
+			vcpu->arch.shared = new_shared;
+		}
+#endif
+
+		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
+
+		r = EV_SUCCESS;
+		break;
+	}
+	case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
+		r = EV_SUCCESS;
+#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
+		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
+#endif
+
+		/* Second return value is in r4 */
+		break;
+	case EV_HCALL_TOKEN(EV_IDLE):
+		r = EV_SUCCESS;
+		kvm_vcpu_halt(vcpu);
+		break;
+	default:
+		r = EV_UNIMPLEMENTED;
+		break;
+	}
+
+	kvmppc_set_gpr(vcpu, 4, r2);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_kvm_pv);
+
+int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
+{
+	int r = false;
+
+	/* We have to know what CPU to virtualize */
+	if (!vcpu->arch.pvr)
+		goto out;
+
+	/* PAPR only works with book3s_64 */
+	if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
+		goto out;
+
+	/* HV KVM can only do PAPR mode for now */
+	if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm))
+		goto out;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	if (!cpu_has_feature(CPU_FTR_EMB_HV))
+		goto out;
+#endif
+
+	r = true;
+
+out:
+	vcpu->arch.sane = r;
+	return r ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_sanity_check);
+
+int kvmppc_emulate_mmio(struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+	int r;
+
+	er = kvmppc_emulate_loadstore(vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* Future optimization: only reload non-volatiles if they were
+		 * actually modified. */
+		r = RESUME_GUEST_NV;
+		break;
+	case EMULATE_AGAIN:
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_DO_MMIO:
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		/* We must reload nonvolatiles because "update" load/store
+		 * instructions modify register state. */
+		/* Future optimization: only reload non-volatiles if they were
+		 * actually modified. */
+		r = RESUME_HOST_NV;
+		break;
+	case EMULATE_FAIL:
+	{
+		ppc_inst_t last_inst;
+
+		kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+		kvm_debug_ratelimited("Guest access to device memory using unsupported instruction (opcode: %#08x)\n",
+				      ppc_inst_val(last_inst));
+
+		/*
+		 * Injecting a Data Storage here is a bit more
+		 * accurate since the instruction that caused the
+		 * access could still be a valid one.
+		 */
+		if (!IS_ENABLED(CONFIG_BOOKE)) {
+			ulong dsisr = DSISR_BADACCESS;
+
+			if (vcpu->mmio_is_write)
+				dsisr |= DSISR_ISSTORE;
+
+			kvmppc_core_queue_data_storage(vcpu,
+					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
+					vcpu->arch.vaddr_accessed, dsisr);
+		} else {
+			/*
+			 * BookE does not send a SIGBUS on a bad
+			 * fault, so use a Program interrupt instead
+			 * to avoid a fault loop.
+			 */
+			kvmppc_core_queue_program(vcpu, 0);
+		}
+
+		r = RESUME_GUEST;
+		break;
+	}
+	default:
+		WARN_ON(1);
+		r = RESUME_GUEST;
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio);
+
+int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
+	      bool data)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
+	struct kvmppc_pte pte;
+	int r = -EINVAL;
+
+	vcpu->stat.st++;
+
+	if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr)
+		r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr,
+							    size);
+
+	if ((!r) || (r == -EAGAIN))
+		return r;
+
+	r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
+			 XLATE_WRITE, &pte);
+	if (r < 0)
+		return r;
+
+	*eaddr = pte.raddr;
+
+	if (!pte.may_write)
+		return -EPERM;
+
+	/* Magic page override */
+	if (kvmppc_supports_magic_page(vcpu) && mp_pa &&
+	    ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		void *magic = vcpu->arch.shared;
+		magic += pte.eaddr & 0xfff;
+		memcpy(magic, ptr, size);
+		return EMULATE_DONE;
+	}
+
+	if (kvm_write_guest(vcpu->kvm, pte.raddr, ptr, size))
+		return EMULATE_DO_MMIO;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvmppc_st);
+
+int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
+		      bool data)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
+	struct kvmppc_pte pte;
+	int rc = -EINVAL;
+
+	vcpu->stat.ld++;
+
+	if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr)
+		rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr,
+							      size);
+
+	if ((!rc) || (rc == -EAGAIN))
+		return rc;
+
+	rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
+			  XLATE_READ, &pte);
+	if (rc)
+		return rc;
+
+	*eaddr = pte.raddr;
+
+	if (!pte.may_read)
+		return -EPERM;
+
+	if (!data && !pte.may_execute)
+		return -ENOEXEC;
+
+	/* Magic page override */
+	if (kvmppc_supports_magic_page(vcpu) && mp_pa &&
+	    ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		void *magic = vcpu->arch.shared;
+		magic += pte.eaddr & 0xfff;
+		memcpy(ptr, magic, size);
+		return EMULATE_DONE;
+	}
+
+	kvm_vcpu_srcu_read_lock(vcpu);
+	rc = kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size);
+	kvm_vcpu_srcu_read_unlock(vcpu);
+	if (rc)
+		return EMULATE_DO_MMIO;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ld);
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+	struct kvmppc_ops *kvm_ops = NULL;
+	int r;
+
+	/*
+	 * if we have both HV and PR enabled, default is HV
+	 */
+	if (type == 0) {
+		if (kvmppc_hv_ops)
+			kvm_ops = kvmppc_hv_ops;
+		else
+			kvm_ops = kvmppc_pr_ops;
+		if (!kvm_ops)
+			goto err_out;
+	} else	if (type == KVM_VM_PPC_HV) {
+		if (!kvmppc_hv_ops)
+			goto err_out;
+		kvm_ops = kvmppc_hv_ops;
+	} else if (type == KVM_VM_PPC_PR) {
+		if (!kvmppc_pr_ops)
+			goto err_out;
+		kvm_ops = kvmppc_pr_ops;
+	} else
+		goto err_out;
+
+	if (!try_module_get(kvm_ops->owner))
+		return -ENOENT;
+
+	kvm->arch.kvm_ops = kvm_ops;
+	r = kvmppc_core_init_vm(kvm);
+	if (r)
+		module_put(kvm_ops->owner);
+	return r;
+err_out:
+	return -EINVAL;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_XICS
+	/*
+	 * We call kick_all_cpus_sync() to ensure that all
+	 * CPUs have executed any pending IPIs before we
+	 * continue and free VCPUs structures below.
+	 */
+	if (is_kvmppc_hv_enabled(kvm))
+		kick_all_cpus_sync();
+#endif
+
+	kvm_destroy_vcpus(kvm);
+
+	mutex_lock(&kvm->lock);
+
+	kvmppc_core_destroy_vm(kvm);
+
+	mutex_unlock(&kvm->lock);
+
+	/* drop the module reference */
+	module_put(kvm->arch.kvm_ops->owner);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+	int r;
+	/* Assume we're using HV mode when the HV module is loaded */
+	int hv_enabled = kvmppc_hv_ops ? 1 : 0;
+
+	if (kvm) {
+		/*
+		 * Hooray - we know which VM type we're running on. Depend on
+		 * that rather than the guess above.
+		 */
+		hv_enabled = is_kvmppc_hv_enabled(kvm);
+	}
+
+	switch (ext) {
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_SREGS:
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+	case KVM_CAP_PPC_EPR:
+#else
+	case KVM_CAP_PPC_SEGSTATE:
+	case KVM_CAP_PPC_HIOR:
+	case KVM_CAP_PPC_PAPR:
+#endif
+	case KVM_CAP_PPC_UNSET_IRQ:
+	case KVM_CAP_PPC_IRQ_LEVEL:
+	case KVM_CAP_ENABLE_CAP:
+	case KVM_CAP_ONE_REG:
+	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
+	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_SET_GUEST_DEBUG:
+		r = 1;
+		break;
+	case KVM_CAP_PPC_GUEST_DEBUG_SSTEP:
+	case KVM_CAP_PPC_PAIRED_SINGLES:
+	case KVM_CAP_PPC_OSI:
+	case KVM_CAP_PPC_GET_PVINFO:
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_CAP_SW_TLB:
+#endif
+		/* We support this only for PR */
+		r = !hv_enabled;
+		break;
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC:
+		r = 1;
+		break;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_SPAPR_TCE_64:
+		r = 1;
+		break;
+	case KVM_CAP_SPAPR_TCE_VFIO:
+		r = !!cpu_has_feature(CPU_FTR_HVMODE);
+		break;
+	case KVM_CAP_PPC_RTAS:
+	case KVM_CAP_PPC_FIXUP_HCALL:
+	case KVM_CAP_PPC_ENABLE_HCALL:
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS:
+#endif
+	case KVM_CAP_PPC_GET_CPU_CHAR:
+		r = 1;
+		break;
+#ifdef CONFIG_KVM_XIVE
+	case KVM_CAP_PPC_IRQ_XIVE:
+		/*
+		 * We need XIVE to be enabled on the platform (implies
+		 * a POWER9 processor) and the PowerNV platform, as
+		 * nested is not yet supported.
+		 */
+		r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) &&
+			kvmppc_xive_native_supported();
+		break;
+#endif
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+	case KVM_CAP_IRQFD_RESAMPLE:
+		r = !xive_enabled();
+		break;
+#endif
+
+	case KVM_CAP_PPC_ALLOC_HTAB:
+		r = hv_enabled;
+		break;
+#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_SMT:
+		r = 0;
+		if (kvm) {
+			if (kvm->arch.emul_smt_mode > 1)
+				r = kvm->arch.emul_smt_mode;
+			else
+				r = kvm->arch.smt_mode;
+		} else if (hv_enabled) {
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				r = 1;
+			else
+				r = threads_per_subcore;
+		}
+		break;
+	case KVM_CAP_PPC_SMT_POSSIBLE:
+		r = 1;
+		if (hv_enabled) {
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				r = ((threads_per_subcore << 1) - 1);
+			else
+				/* P9 can emulate dbells, so allow any mode */
+				r = 8 | 4 | 2 | 1;
+		}
+		break;
+	case KVM_CAP_PPC_RMA:
+		r = 0;
+		break;
+	case KVM_CAP_PPC_HWRNG:
+		r = kvmppc_hwrng_present();
+		break;
+	case KVM_CAP_PPC_MMU_RADIX:
+		r = !!(hv_enabled && radix_enabled());
+		break;
+	case KVM_CAP_PPC_MMU_HASH_V3:
+		r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible &&
+		       kvmppc_hv_ops->hash_v3_possible());
+		break;
+	case KVM_CAP_PPC_NESTED_HV:
+		r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
+		       !kvmppc_hv_ops->enable_nested(NULL));
+		break;
+#endif
+	case KVM_CAP_SYNC_MMU:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		r = hv_enabled;
+#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+		r = 1;
+#else
+		r = 0;
+#endif
+		break;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_HTAB_FD:
+		r = hv_enabled;
+		break;
+#endif
+	case KVM_CAP_NR_VCPUS:
+		/*
+		 * Recommending a number of CPUs is somewhat arbitrary; we
+		 * return the number of present CPUs for -HV (since a host
+		 * will have secondary threads "offline"), and for other KVM
+		 * implementations just count online CPUs.
+		 */
+		if (hv_enabled)
+			r = min_t(unsigned int, num_present_cpus(), KVM_MAX_VCPUS);
+		else
+			r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
+		break;
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_IDS;
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CAP_PPC_GET_SMMU_INFO:
+		r = 1;
+		break;
+	case KVM_CAP_SPAPR_MULTITCE:
+		r = 1;
+		break;
+	case KVM_CAP_SPAPR_RESIZE_HPT:
+		r = !!hv_enabled;
+		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_FWNMI:
+		r = hv_enabled;
+		break;
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_CAP_PPC_HTM:
+		r = !!(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM) ||
+		     (hv_enabled && cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST));
+		break;
+#endif
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+	case KVM_CAP_PPC_SECURE_GUEST:
+		r = hv_enabled && kvmppc_hv_ops->enable_svm &&
+			!kvmppc_hv_ops->enable_svm(NULL);
+		break;
+	case KVM_CAP_PPC_DAWR1:
+		r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
+		       !kvmppc_hv_ops->enable_dawr1(NULL));
+		break;
+	case KVM_CAP_PPC_RPT_INVALIDATE:
+		r = 1;
+		break;
+#endif
+	case KVM_CAP_PPC_AIL_MODE_3:
+		r = 0;
+		/*
+		 * KVM PR, POWER7, and some POWER9s don't support AIL=3 mode.
+		 * The POWER9s can support it if the guest runs in hash mode,
+		 * but QEMU doesn't necessarily query the capability in time.
+		 */
+		if (hv_enabled) {
+			if (kvmhv_on_pseries()) {
+				if (pseries_reloc_on_exception())
+					r = 1;
+			} else if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+				  !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
+				r = 1;
+			}
+		}
+		break;
+	default:
+		r = 0;
+		break;
+	}
+	return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+	return -EINVAL;
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	kvmppc_core_free_memslot(kvm, slot);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   const struct kvm_memory_slot *old,
+				   struct kvm_memory_slot *new,
+				   enum kvm_mr_change change)
+{
+	return kvmppc_core_prepare_memory_region(kvm, old, new, change);
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
+				   enum kvm_mr_change change)
+{
+	kvmppc_core_commit_memory_region(kvm, old, new, change);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	kvmppc_core_flush_memslot(kvm, slot);
+}
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+	return 0;
+}
+
+static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer);
+	kvmppc_decrementer_func(vcpu);
+
+	return HRTIMER_NORESTART;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	mutex_init(&vcpu->arch.exit_timing_lock);
+#endif
+	err = kvmppc_subarch_vcpu_init(vcpu);
+	if (err)
+		return err;
+
+	err = kvmppc_core_vcpu_create(vcpu);
+	if (err)
+		goto out_vcpu_uninit;
+
+	rcuwait_init(&vcpu->arch.wait);
+	vcpu->arch.waitp = &vcpu->arch.wait;
+	return 0;
+
+out_vcpu_uninit:
+	kvmppc_subarch_vcpu_uninit(vcpu);
+	return err;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	/* Make sure we're not using the vcpu anymore */
+	hrtimer_cancel(&vcpu->arch.dec_timer);
+
+	switch (vcpu->arch.irq_type) {
+	case KVMPPC_IRQ_MPIC:
+		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
+		break;
+	case KVMPPC_IRQ_XICS:
+		if (xics_on_xive())
+			kvmppc_xive_cleanup_vcpu(vcpu);
+		else
+			kvmppc_xics_free_icp(vcpu);
+		break;
+	case KVMPPC_IRQ_XIVE:
+		kvmppc_xive_native_cleanup_vcpu(vcpu);
+		break;
+	}
+
+	kvmppc_core_vcpu_free(vcpu);
+
+	kvmppc_subarch_vcpu_uninit(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return kvmppc_core_pending_dec(vcpu);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_BOOKE
+	/*
+	 * vrsave (formerly usprg0) isn't used by Linux, but may
+	 * be used by the guest.
+	 *
+	 * On non-booke this is associated with Altivec and
+	 * is handled by code in book3s.c.
+	 */
+	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+#endif
+	kvmppc_core_vcpu_load(vcpu, cpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	kvmppc_core_vcpu_put(vcpu);
+#ifdef CONFIG_BOOKE
+	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
+#endif
+}
+
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+	return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+		(kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+				     struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+		return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+	return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+		kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
+#ifdef CONFIG_VSX
+static inline int kvmppc_get_vsr_dword_offset(int index)
+{
+	int offset;
+
+	if ((index != 0) && (index != 1))
+		return -1;
+
+#ifdef __BIG_ENDIAN
+	offset =  index;
+#else
+	offset = 1 - index;
+#endif
+
+	return offset;
+}
+
+static inline int kvmppc_get_vsr_word_offset(int index)
+{
+	int offset;
+
+	if ((index > 3) || (index < 0))
+		return -1;
+
+#ifdef __BIG_ENDIAN
+	offset = index;
+#else
+	offset = 3 - index;
+#endif
+	return offset;
+}
+
+static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu,
+	u64 gpr)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
+		val.vsxval[offset] = gpr;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
+	} else {
+		VCPU_VSX_FPR(vcpu, index, offset) = gpr;
+	}
+}
+
+static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu,
+	u64 gpr)
+{
+	union kvmppc_one_reg val;
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
+		val.vsxval[0] = gpr;
+		val.vsxval[1] = gpr;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
+	} else {
+		VCPU_VSX_FPR(vcpu, index, 0) = gpr;
+		VCPU_VSX_FPR(vcpu, index, 1) = gpr;
+	}
+}
+
+static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu,
+	u32 gpr)
+{
+	union kvmppc_one_reg val;
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (index >= 32) {
+		val.vsx32val[0] = gpr;
+		val.vsx32val[1] = gpr;
+		val.vsx32val[2] = gpr;
+		val.vsx32val[3] = gpr;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
+	} else {
+		val.vsx32val[0] = gpr;
+		val.vsx32val[1] = gpr;
+		VCPU_VSX_FPR(vcpu, index, 0) = val.vsxval[0];
+		VCPU_VSX_FPR(vcpu, index, 1) = val.vsxval[0];
+	}
+}
+
+static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
+	u32 gpr32)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+	int dword_offset, word_offset;
+
+	if (offset == -1)
+		return;
+
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
+		val.vsx32val[offset] = gpr32;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
+	} else {
+		dword_offset = offset / 2;
+		word_offset = offset % 2;
+		val.vsxval[0] = VCPU_VSX_FPR(vcpu, index, dword_offset);
+		val.vsx32val[word_offset] = gpr32;
+		VCPU_VSX_FPR(vcpu, index, dword_offset) = val.vsxval[0];
+	}
+}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_ALTIVEC
+static inline int kvmppc_get_vmx_offset_generic(struct kvm_vcpu *vcpu,
+		int index, int element_size)
+{
+	int offset;
+	int elts = sizeof(vector128)/element_size;
+
+	if ((index < 0) || (index >= elts))
+		return -1;
+
+	if (kvmppc_need_byteswap(vcpu))
+		offset = elts - index - 1;
+	else
+		offset = index;
+
+	return offset;
+}
+
+static inline int kvmppc_get_vmx_dword_offset(struct kvm_vcpu *vcpu,
+		int index)
+{
+	return kvmppc_get_vmx_offset_generic(vcpu, index, 8);
+}
+
+static inline int kvmppc_get_vmx_word_offset(struct kvm_vcpu *vcpu,
+		int index)
+{
+	return kvmppc_get_vmx_offset_generic(vcpu, index, 4);
+}
+
+static inline int kvmppc_get_vmx_hword_offset(struct kvm_vcpu *vcpu,
+		int index)
+{
+	return kvmppc_get_vmx_offset_generic(vcpu, index, 2);
+}
+
+static inline int kvmppc_get_vmx_byte_offset(struct kvm_vcpu *vcpu,
+		int index)
+{
+	return kvmppc_get_vmx_offset_generic(vcpu, index, 1);
+}
+
+
+static inline void kvmppc_set_vmx_dword(struct kvm_vcpu *vcpu,
+	u64 gpr)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vmx_dword_offset(vcpu,
+			vcpu->arch.mmio_vmx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	val.vval = VCPU_VSX_VR(vcpu, index);
+	val.vsxval[offset] = gpr;
+	VCPU_VSX_VR(vcpu, index) = val.vval;
+}
+
+static inline void kvmppc_set_vmx_word(struct kvm_vcpu *vcpu,
+	u32 gpr32)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vmx_word_offset(vcpu,
+			vcpu->arch.mmio_vmx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	val.vval = VCPU_VSX_VR(vcpu, index);
+	val.vsx32val[offset] = gpr32;
+	VCPU_VSX_VR(vcpu, index) = val.vval;
+}
+
+static inline void kvmppc_set_vmx_hword(struct kvm_vcpu *vcpu,
+	u16 gpr16)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vmx_hword_offset(vcpu,
+			vcpu->arch.mmio_vmx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	val.vval = VCPU_VSX_VR(vcpu, index);
+	val.vsx16val[offset] = gpr16;
+	VCPU_VSX_VR(vcpu, index) = val.vval;
+}
+
+static inline void kvmppc_set_vmx_byte(struct kvm_vcpu *vcpu,
+	u8 gpr8)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vmx_byte_offset(vcpu,
+			vcpu->arch.mmio_vmx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	val.vval = VCPU_VSX_VR(vcpu, index);
+	val.vsx8val[offset] = gpr8;
+	VCPU_VSX_VR(vcpu, index) = val.vval;
+}
+#endif /* CONFIG_ALTIVEC */
+
+#ifdef CONFIG_PPC_FPU
+static inline u64 sp_to_dp(u32 fprs)
+{
+	u64 fprd;
+
+	preempt_disable();
+	enable_kernel_fp();
+	asm ("lfs%U1%X1 0,%1; stfd%U0%X0 0,%0" : "=m<>" (fprd) : "m<>" (fprs)
+	     : "fr0");
+	preempt_enable();
+	return fprd;
+}
+
+static inline u32 dp_to_sp(u64 fprd)
+{
+	u32 fprs;
+
+	preempt_disable();
+	enable_kernel_fp();
+	asm ("lfd%U1%X1 0,%1; stfs%U0%X0 0,%0" : "=m<>" (fprs) : "m<>" (fprd)
+	     : "fr0");
+	preempt_enable();
+	return fprs;
+}
+
+#else
+#define sp_to_dp(x)	(x)
+#define dp_to_sp(x)	(x)
+#endif /* CONFIG_PPC_FPU */
+
+static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u64 gpr;
+
+	if (run->mmio.len > sizeof(gpr))
+		return;
+
+	if (!vcpu->arch.mmio_host_swabbed) {
+		switch (run->mmio.len) {
+		case 8: gpr = *(u64 *)run->mmio.data; break;
+		case 4: gpr = *(u32 *)run->mmio.data; break;
+		case 2: gpr = *(u16 *)run->mmio.data; break;
+		case 1: gpr = *(u8 *)run->mmio.data; break;
+		}
+	} else {
+		switch (run->mmio.len) {
+		case 8: gpr = swab64(*(u64 *)run->mmio.data); break;
+		case 4: gpr = swab32(*(u32 *)run->mmio.data); break;
+		case 2: gpr = swab16(*(u16 *)run->mmio.data); break;
+		case 1: gpr = *(u8 *)run->mmio.data; break;
+		}
+	}
+
+	/* conversion between single and double precision */
+	if ((vcpu->arch.mmio_sp64_extend) && (run->mmio.len == 4))
+		gpr = sp_to_dp(gpr);
+
+	if (vcpu->arch.mmio_sign_extend) {
+		switch (run->mmio.len) {
+#ifdef CONFIG_PPC64
+		case 4:
+			gpr = (s64)(s32)gpr;
+			break;
+#endif
+		case 2:
+			gpr = (s64)(s16)gpr;
+			break;
+		case 1:
+			gpr = (s64)(s8)gpr;
+			break;
+		}
+	}
+
+	switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) {
+	case KVM_MMIO_REG_GPR:
+		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
+		break;
+	case KVM_MMIO_REG_FPR:
+		if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+			vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_FP);
+
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
+		break;
+#ifdef CONFIG_PPC_BOOK3S
+	case KVM_MMIO_REG_QPR:
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		break;
+	case KVM_MMIO_REG_FQPR:
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		break;
+#endif
+#ifdef CONFIG_VSX
+	case KVM_MMIO_REG_VSX:
+		if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+			vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_VSX);
+
+		if (vcpu->arch.mmio_copy_type == KVMPPC_VSX_COPY_DWORD)
+			kvmppc_set_vsr_dword(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type == KVMPPC_VSX_COPY_WORD)
+			kvmppc_set_vsr_word(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type ==
+				KVMPPC_VSX_COPY_DWORD_LOAD_DUMP)
+			kvmppc_set_vsr_dword_dump(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type ==
+				KVMPPC_VSX_COPY_WORD_LOAD_DUMP)
+			kvmppc_set_vsr_word_dump(vcpu, gpr);
+		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case KVM_MMIO_REG_VMX:
+		if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+			vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_VEC);
+
+		if (vcpu->arch.mmio_copy_type == KVMPPC_VMX_COPY_DWORD)
+			kvmppc_set_vmx_dword(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type == KVMPPC_VMX_COPY_WORD)
+			kvmppc_set_vmx_word(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type ==
+				KVMPPC_VMX_COPY_HWORD)
+			kvmppc_set_vmx_hword(vcpu, gpr);
+		else if (vcpu->arch.mmio_copy_type ==
+				KVMPPC_VMX_COPY_BYTE)
+			kvmppc_set_vmx_byte(vcpu, gpr);
+		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_MMIO_REG_NESTED_GPR:
+		if (kvmppc_need_byteswap(vcpu))
+			gpr = swab64(gpr);
+		kvm_vcpu_write_guest(vcpu, vcpu->arch.nested_io_gpr, &gpr,
+				     sizeof(gpr));
+		break;
+#endif
+	default:
+		BUG();
+	}
+}
+
+static int __kvmppc_handle_load(struct kvm_vcpu *vcpu,
+				unsigned int rt, unsigned int bytes,
+				int is_default_endian, int sign_extend)
+{
+	struct kvm_run *run = vcpu->run;
+	int idx, ret;
+	bool host_swabbed;
+
+	/* Pity C doesn't have a logical XOR operator */
+	if (kvmppc_need_byteswap(vcpu)) {
+		host_swabbed = is_default_endian;
+	} else {
+		host_swabbed = !is_default_endian;
+	}
+
+	if (bytes > sizeof(run->mmio.data))
+		return EMULATE_FAIL;
+
+	run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+	run->mmio.len = bytes;
+	run->mmio.is_write = 0;
+
+	vcpu->arch.io_gpr = rt;
+	vcpu->arch.mmio_host_swabbed = host_swabbed;
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = 0;
+	vcpu->arch.mmio_sign_extend = sign_extend;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
+			      bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
+		kvmppc_complete_mmio_load(vcpu);
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_DO_MMIO;
+}
+
+int kvmppc_handle_load(struct kvm_vcpu *vcpu,
+		       unsigned int rt, unsigned int bytes,
+		       int is_default_endian)
+{
+	return __kvmppc_handle_load(vcpu, rt, bytes, is_default_endian, 0);
+}
+EXPORT_SYMBOL_GPL(kvmppc_handle_load);
+
+/* Same as above, but sign extends */
+int kvmppc_handle_loads(struct kvm_vcpu *vcpu,
+			unsigned int rt, unsigned int bytes,
+			int is_default_endian)
+{
+	return __kvmppc_handle_load(vcpu, rt, bytes, is_default_endian, 1);
+}
+
+#ifdef CONFIG_VSX
+int kvmppc_handle_vsx_load(struct kvm_vcpu *vcpu,
+			unsigned int rt, unsigned int bytes,
+			int is_default_endian, int mmio_sign_extend)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+
+	/* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+	if (vcpu->arch.mmio_vsx_copy_nums > 4)
+		return EMULATE_FAIL;
+
+	while (vcpu->arch.mmio_vsx_copy_nums) {
+		emulated = __kvmppc_handle_load(vcpu, rt, bytes,
+			is_default_endian, mmio_sign_extend);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += vcpu->run->mmio.len;
+
+		vcpu->arch.mmio_vsx_copy_nums--;
+		vcpu->arch.mmio_vsx_offset++;
+	}
+	return emulated;
+}
+#endif /* CONFIG_VSX */
+
+int kvmppc_handle_store(struct kvm_vcpu *vcpu,
+			u64 val, unsigned int bytes, int is_default_endian)
+{
+	struct kvm_run *run = vcpu->run;
+	void *data = run->mmio.data;
+	int idx, ret;
+	bool host_swabbed;
+
+	/* Pity C doesn't have a logical XOR operator */
+	if (kvmppc_need_byteswap(vcpu)) {
+		host_swabbed = is_default_endian;
+	} else {
+		host_swabbed = !is_default_endian;
+	}
+
+	if (bytes > sizeof(run->mmio.data))
+		return EMULATE_FAIL;
+
+	run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+	run->mmio.len = bytes;
+	run->mmio.is_write = 1;
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = 1;
+
+	if ((vcpu->arch.mmio_sp64_extend) && (bytes == 4))
+		val = dp_to_sp(val);
+
+	/* Store the value at the lowest bytes in 'data'. */
+	if (!host_swabbed) {
+		switch (bytes) {
+		case 8: *(u64 *)data = val; break;
+		case 4: *(u32 *)data = val; break;
+		case 2: *(u16 *)data = val; break;
+		case 1: *(u8  *)data = val; break;
+		}
+	} else {
+		switch (bytes) {
+		case 8: *(u64 *)data = swab64(val); break;
+		case 4: *(u32 *)data = swab32(val); break;
+		case 2: *(u16 *)data = swab16(val); break;
+		case 1: *(u8  *)data = val; break;
+		}
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
+			       bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_DO_MMIO;
+}
+EXPORT_SYMBOL_GPL(kvmppc_handle_store);
+
+#ifdef CONFIG_VSX
+static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
+{
+	u32 dword_offset, word_offset;
+	union kvmppc_one_reg reg;
+	int vsx_offset = 0;
+	int copy_type = vcpu->arch.mmio_copy_type;
+	int result = 0;
+
+	switch (copy_type) {
+	case KVMPPC_VSX_COPY_DWORD:
+		vsx_offset =
+			kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset);
+
+		if (vsx_offset == -1) {
+			result = -1;
+			break;
+		}
+
+		if (rs < 32) {
+			*val = VCPU_VSX_FPR(vcpu, rs, vsx_offset);
+		} else {
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
+			*val = reg.vsxval[vsx_offset];
+		}
+		break;
+
+	case KVMPPC_VSX_COPY_WORD:
+		vsx_offset =
+			kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset);
+
+		if (vsx_offset == -1) {
+			result = -1;
+			break;
+		}
+
+		if (rs < 32) {
+			dword_offset = vsx_offset / 2;
+			word_offset = vsx_offset % 2;
+			reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset);
+			*val = reg.vsx32val[word_offset];
+		} else {
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
+			*val = reg.vsx32val[vsx_offset];
+		}
+		break;
+
+	default:
+		result = -1;
+		break;
+	}
+
+	return result;
+}
+
+int kvmppc_handle_vsx_store(struct kvm_vcpu *vcpu,
+			int rs, unsigned int bytes, int is_default_endian)
+{
+	u64 val;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	vcpu->arch.io_gpr = rs;
+
+	/* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+	if (vcpu->arch.mmio_vsx_copy_nums > 4)
+		return EMULATE_FAIL;
+
+	while (vcpu->arch.mmio_vsx_copy_nums) {
+		if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1)
+			return EMULATE_FAIL;
+
+		emulated = kvmppc_handle_store(vcpu,
+			 val, bytes, is_default_endian);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += vcpu->run->mmio.len;
+
+		vcpu->arch.mmio_vsx_copy_nums--;
+		vcpu->arch.mmio_vsx_offset++;
+	}
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	enum emulation_result emulated = EMULATE_FAIL;
+	int r;
+
+	vcpu->arch.paddr_accessed += run->mmio.len;
+
+	if (!vcpu->mmio_is_write) {
+		emulated = kvmppc_handle_vsx_load(vcpu, vcpu->arch.io_gpr,
+			 run->mmio.len, 1, vcpu->arch.mmio_sign_extend);
+	} else {
+		emulated = kvmppc_handle_vsx_store(vcpu,
+			 vcpu->arch.io_gpr, run->mmio.len, 1);
+	}
+
+	switch (emulated) {
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST;
+		break;
+	case EMULATE_FAIL:
+		pr_info("KVM: MMIO emulation failed (VSX repeat)\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		r = RESUME_HOST;
+		break;
+	default:
+		r = RESUME_GUEST;
+		break;
+	}
+	return r;
+}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_ALTIVEC
+int kvmppc_handle_vmx_load(struct kvm_vcpu *vcpu,
+		unsigned int rt, unsigned int bytes, int is_default_endian)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+
+	if (vcpu->arch.mmio_vmx_copy_nums > 2)
+		return EMULATE_FAIL;
+
+	while (vcpu->arch.mmio_vmx_copy_nums) {
+		emulated = __kvmppc_handle_load(vcpu, rt, bytes,
+				is_default_endian, 0);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += vcpu->run->mmio.len;
+		vcpu->arch.mmio_vmx_copy_nums--;
+		vcpu->arch.mmio_vmx_offset++;
+	}
+
+	return emulated;
+}
+
+static int kvmppc_get_vmx_dword(struct kvm_vcpu *vcpu, int index, u64 *val)
+{
+	union kvmppc_one_reg reg;
+	int vmx_offset = 0;
+	int result = 0;
+
+	vmx_offset =
+		kvmppc_get_vmx_dword_offset(vcpu, vcpu->arch.mmio_vmx_offset);
+
+	if (vmx_offset == -1)
+		return -1;
+
+	reg.vval = VCPU_VSX_VR(vcpu, index);
+	*val = reg.vsxval[vmx_offset];
+
+	return result;
+}
+
+static int kvmppc_get_vmx_word(struct kvm_vcpu *vcpu, int index, u64 *val)
+{
+	union kvmppc_one_reg reg;
+	int vmx_offset = 0;
+	int result = 0;
+
+	vmx_offset =
+		kvmppc_get_vmx_word_offset(vcpu, vcpu->arch.mmio_vmx_offset);
+
+	if (vmx_offset == -1)
+		return -1;
+
+	reg.vval = VCPU_VSX_VR(vcpu, index);
+	*val = reg.vsx32val[vmx_offset];
+
+	return result;
+}
+
+static int kvmppc_get_vmx_hword(struct kvm_vcpu *vcpu, int index, u64 *val)
+{
+	union kvmppc_one_reg reg;
+	int vmx_offset = 0;
+	int result = 0;
+
+	vmx_offset =
+		kvmppc_get_vmx_hword_offset(vcpu, vcpu->arch.mmio_vmx_offset);
+
+	if (vmx_offset == -1)
+		return -1;
+
+	reg.vval = VCPU_VSX_VR(vcpu, index);
+	*val = reg.vsx16val[vmx_offset];
+
+	return result;
+}
+
+static int kvmppc_get_vmx_byte(struct kvm_vcpu *vcpu, int index, u64 *val)
+{
+	union kvmppc_one_reg reg;
+	int vmx_offset = 0;
+	int result = 0;
+
+	vmx_offset =
+		kvmppc_get_vmx_byte_offset(vcpu, vcpu->arch.mmio_vmx_offset);
+
+	if (vmx_offset == -1)
+		return -1;
+
+	reg.vval = VCPU_VSX_VR(vcpu, index);
+	*val = reg.vsx8val[vmx_offset];
+
+	return result;
+}
+
+int kvmppc_handle_vmx_store(struct kvm_vcpu *vcpu,
+		unsigned int rs, unsigned int bytes, int is_default_endian)
+{
+	u64 val = 0;
+	unsigned int index = rs & KVM_MMIO_REG_MASK;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	if (vcpu->arch.mmio_vmx_copy_nums > 2)
+		return EMULATE_FAIL;
+
+	vcpu->arch.io_gpr = rs;
+
+	while (vcpu->arch.mmio_vmx_copy_nums) {
+		switch (vcpu->arch.mmio_copy_type) {
+		case KVMPPC_VMX_COPY_DWORD:
+			if (kvmppc_get_vmx_dword(vcpu, index, &val) == -1)
+				return EMULATE_FAIL;
+
+			break;
+		case KVMPPC_VMX_COPY_WORD:
+			if (kvmppc_get_vmx_word(vcpu, index, &val) == -1)
+				return EMULATE_FAIL;
+			break;
+		case KVMPPC_VMX_COPY_HWORD:
+			if (kvmppc_get_vmx_hword(vcpu, index, &val) == -1)
+				return EMULATE_FAIL;
+			break;
+		case KVMPPC_VMX_COPY_BYTE:
+			if (kvmppc_get_vmx_byte(vcpu, index, &val) == -1)
+				return EMULATE_FAIL;
+			break;
+		default:
+			return EMULATE_FAIL;
+		}
+
+		emulated = kvmppc_handle_store(vcpu, val, bytes,
+				is_default_endian);
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += vcpu->run->mmio.len;
+		vcpu->arch.mmio_vmx_copy_nums--;
+		vcpu->arch.mmio_vmx_offset++;
+	}
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mmio_vmx_loadstore(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	enum emulation_result emulated = EMULATE_FAIL;
+	int r;
+
+	vcpu->arch.paddr_accessed += run->mmio.len;
+
+	if (!vcpu->mmio_is_write) {
+		emulated = kvmppc_handle_vmx_load(vcpu,
+				vcpu->arch.io_gpr, run->mmio.len, 1);
+	} else {
+		emulated = kvmppc_handle_vmx_store(vcpu,
+				vcpu->arch.io_gpr, run->mmio.len, 1);
+	}
+
+	switch (emulated) {
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST;
+		break;
+	case EMULATE_FAIL:
+		pr_info("KVM: MMIO emulation failed (VMX repeat)\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		r = RESUME_HOST;
+		break;
+	default:
+		r = RESUME_GUEST;
+		break;
+	}
+	return r;
+}
+#endif /* CONFIG_ALTIVEC */
+
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = 0;
+	union kvmppc_one_reg val;
+	int size;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	r = kvmppc_get_one_reg(vcpu, reg->id, &val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0];
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]);
+			break;
+		case KVM_REG_PPC_VRSAVE:
+			val = get_reg_val(reg->id, vcpu->arch.vrsave);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	if (r)
+		return r;
+
+	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+		r = -EFAULT;
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r;
+	union kvmppc_one_reg val;
+	int size;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+		return -EFAULT;
+
+	r = kvmppc_set_one_reg(vcpu, reg->id, &val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val);
+			break;
+		case KVM_REG_PPC_VRSAVE:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vrsave = set_reg_val(reg->id, val);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	int r;
+
+	vcpu_load(vcpu);
+
+	if (vcpu->mmio_needed) {
+		vcpu->mmio_needed = 0;
+		if (!vcpu->mmio_is_write)
+			kvmppc_complete_mmio_load(vcpu);
+#ifdef CONFIG_VSX
+		if (vcpu->arch.mmio_vsx_copy_nums > 0) {
+			vcpu->arch.mmio_vsx_copy_nums--;
+			vcpu->arch.mmio_vsx_offset++;
+		}
+
+		if (vcpu->arch.mmio_vsx_copy_nums > 0) {
+			r = kvmppc_emulate_mmio_vsx_loadstore(vcpu);
+			if (r == RESUME_HOST) {
+				vcpu->mmio_needed = 1;
+				goto out;
+			}
+		}
+#endif
+#ifdef CONFIG_ALTIVEC
+		if (vcpu->arch.mmio_vmx_copy_nums > 0) {
+			vcpu->arch.mmio_vmx_copy_nums--;
+			vcpu->arch.mmio_vmx_offset++;
+		}
+
+		if (vcpu->arch.mmio_vmx_copy_nums > 0) {
+			r = kvmppc_emulate_mmio_vmx_loadstore(vcpu);
+			if (r == RESUME_HOST) {
+				vcpu->mmio_needed = 1;
+				goto out;
+			}
+		}
+#endif
+	} else if (vcpu->arch.osi_needed) {
+		u64 *gprs = run->osi.gprs;
+		int i;
+
+		for (i = 0; i < 32; i++)
+			kvmppc_set_gpr(vcpu, i, gprs[i]);
+		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 9; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
+#ifdef CONFIG_BOOKE
+	} else if (vcpu->arch.epr_needed) {
+		kvmppc_set_epr(vcpu, run->epr.epr);
+		vcpu->arch.epr_needed = 0;
+#endif
+	}
+
+	kvm_sigset_activate(vcpu);
+
+	if (run->immediate_exit)
+		r = -EINTR;
+	else
+		r = kvmppc_vcpu_run(vcpu);
+
+	kvm_sigset_deactivate(vcpu);
+
+#ifdef CONFIG_ALTIVEC
+out:
+#endif
+
+	/*
+	 * We're already returning to userspace, don't pass the
+	 * RESUME_HOST flags along.
+	 */
+	if (r > 0)
+		r = 0;
+
+	vcpu_put(vcpu);
+	return r;
+}
+
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
+{
+	if (irq->irq == KVM_INTERRUPT_UNSET) {
+		kvmppc_core_dequeue_external(vcpu);
+		return 0;
+	}
+
+	kvmppc_core_queue_external(vcpu, irq);
+
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+				     struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_PPC_OSI:
+		r = 0;
+		vcpu->arch.osi_enabled = true;
+		break;
+	case KVM_CAP_PPC_PAPR:
+		r = 0;
+		vcpu->arch.papr_enabled = true;
+		break;
+	case KVM_CAP_PPC_EPR:
+		r = 0;
+		if (cap->args[0])
+			vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+		else
+			vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
+		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+		r = 0;
+		vcpu->arch.watchdog_enabled = true;
+		break;
+#endif
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_CAP_SW_TLB: {
+		struct kvm_config_tlb cfg;
+		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
+
+		r = -EFAULT;
+		if (copy_from_user(&cfg, user_ptr, sizeof(cfg)))
+			break;
+
+		r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg);
+		break;
+	}
+#endif
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC: {
+		struct fd f;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		f = fdget(cap->args[0]);
+		if (!f.file)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(f.file);
+		if (dev)
+			r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fdput(f);
+		break;
+	}
+#endif
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS: {
+		struct fd f;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		f = fdget(cap->args[0]);
+		if (!f.file)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(f.file);
+		if (dev) {
+			if (xics_on_xive())
+				r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
+			else
+				r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+		}
+
+		fdput(f);
+		break;
+	}
+#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+	case KVM_CAP_PPC_IRQ_XIVE: {
+		struct fd f;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		f = fdget(cap->args[0]);
+		if (!f.file)
+			break;
+
+		r = -ENXIO;
+		if (!xive_enabled())
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(f.file);
+		if (dev)
+			r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
+							    cap->args[1]);
+
+		fdput(f);
+		break;
+	}
+#endif /* CONFIG_KVM_XIVE */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_FWNMI:
+		r = -EINVAL;
+		if (!is_kvmppc_hv_enabled(vcpu->kvm))
+			break;
+		r = 0;
+		vcpu->kvm->arch.fwnmi_enabled = true;
+		break;
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	if (!r)
+		r = kvmppc_sanity_check(vcpu);
+
+	return r;
+}
+
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+	if (kvm->arch.mpic)
+		return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+	if (kvm->arch.xics || kvm->arch.xive)
+		return true;
+#endif
+	return false;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+			       unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+
+	if (ioctl == KVM_INTERRUPT) {
+		struct kvm_interrupt irq;
+		if (copy_from_user(&irq, argp, sizeof(irq)))
+			return -EFAULT;
+		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+	}
+	return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
+	switch (ioctl) {
+	case KVM_ENABLE_CAP:
+	{
+		struct kvm_enable_cap cap;
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		vcpu_load(vcpu);
+		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+		vcpu_put(vcpu);
+		break;
+	}
+
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG:
+	{
+		struct kvm_one_reg reg;
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			goto out;
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_vcpu_ioctl_set_one_reg(vcpu, &reg);
+		else
+			r = kvm_vcpu_ioctl_get_one_reg(vcpu, &reg);
+		break;
+	}
+
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_DIRTY_TLB: {
+		struct kvm_dirty_tlb dirty;
+		r = -EFAULT;
+		if (copy_from_user(&dirty, argp, sizeof(dirty)))
+			goto out;
+		vcpu_load(vcpu);
+		r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty);
+		vcpu_put(vcpu);
+		break;
+	}
+#endif
+	default:
+		r = -EINVAL;
+	}
+
+out:
+	return r;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
+{
+	u32 inst_nop = 0x60000000;
+#ifdef CONFIG_KVM_BOOKE_HV
+	u32 inst_sc1 = 0x44000022;
+	pvinfo->hcall[0] = cpu_to_be32(inst_sc1);
+	pvinfo->hcall[1] = cpu_to_be32(inst_nop);
+	pvinfo->hcall[2] = cpu_to_be32(inst_nop);
+	pvinfo->hcall[3] = cpu_to_be32(inst_nop);
+#else
+	u32 inst_lis = 0x3c000000;
+	u32 inst_ori = 0x60000000;
+	u32 inst_sc = 0x44000002;
+	u32 inst_imm_mask = 0xffff;
+
+	/*
+	 * The hypercall to get into KVM from within guest context is as
+	 * follows:
+	 *
+	 *    lis r0, r0, KVM_SC_MAGIC_R0@h
+	 *    ori r0, KVM_SC_MAGIC_R0@l
+	 *    sc
+	 *    nop
+	 */
+	pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask));
+	pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask));
+	pvinfo->hcall[2] = cpu_to_be32(inst_sc);
+	pvinfo->hcall[3] = cpu_to_be32(inst_nop);
+#endif
+
+	pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
+
+	return 0;
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+	int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+	ret = ret || (kvm->arch.mpic != NULL);
+#endif
+#ifdef CONFIG_KVM_XICS
+	ret = ret || (kvm->arch.xics != NULL);
+	ret = ret || (kvm->arch.xive != NULL);
+#endif
+	smp_rmb();
+	return ret;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+			  bool line_status)
+{
+	if (!kvm_arch_irqchip_in_kernel(kvm))
+		return -ENXIO;
+
+	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+					irq_event->irq, irq_event->level,
+					line_status);
+	return 0;
+}
+
+
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	case KVM_CAP_PPC_ENABLE_HCALL: {
+		unsigned long hcall = cap->args[0];
+
+		r = -EINVAL;
+		if (hcall > MAX_HCALL_OPCODE || (hcall & 3) ||
+		    cap->args[1] > 1)
+			break;
+		if (!kvmppc_book3s_hcall_implemented(kvm, hcall))
+			break;
+		if (cap->args[1])
+			set_bit(hcall / 4, kvm->arch.enabled_hcalls);
+		else
+			clear_bit(hcall / 4, kvm->arch.enabled_hcalls);
+		r = 0;
+		break;
+	}
+	case KVM_CAP_PPC_SMT: {
+		unsigned long mode = cap->args[0];
+		unsigned long flags = cap->args[1];
+
+		r = -EINVAL;
+		if (kvm->arch.kvm_ops->set_smt_mode)
+			r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
+		break;
+	}
+
+	case KVM_CAP_PPC_NESTED_HV:
+		r = -EINVAL;
+		if (!is_kvmppc_hv_enabled(kvm) ||
+		    !kvm->arch.kvm_ops->enable_nested)
+			break;
+		r = kvm->arch.kvm_ops->enable_nested(kvm);
+		break;
+#endif
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+	case KVM_CAP_PPC_SECURE_GUEST:
+		r = -EINVAL;
+		if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_svm)
+			break;
+		r = kvm->arch.kvm_ops->enable_svm(kvm);
+		break;
+	case KVM_CAP_PPC_DAWR1:
+		r = -EINVAL;
+		if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_dawr1)
+			break;
+		r = kvm->arch.kvm_ops->enable_dawr1(kvm);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * These functions check whether the underlying hardware is safe
+ * against attacks based on observing the effects of speculatively
+ * executed instructions, and whether it supplies instructions for
+ * use in workarounds.  The information comes from firmware, either
+ * via the device tree on powernv platforms or from an hcall on
+ * pseries platforms.
+ */
+#ifdef CONFIG_PPC_PSERIES
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	struct h_cpu_char_result c;
+	unsigned long rc;
+
+	if (!machine_is(pseries))
+		return -ENOTTY;
+
+	rc = plpar_get_cpu_characteristics(&c);
+	if (rc == H_SUCCESS) {
+		cp->character = c.character;
+		cp->behaviour = c.behaviour;
+		cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+			KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+			KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+			KVM_PPC_CPU_CHAR_BR_HINT_HONOURED |
+			KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF |
+			KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
+			KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
+		cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+			KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+			KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
+			KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
+	}
+	return 0;
+}
+#else
+static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	return -ENOTTY;
+}
+#endif
+
+static inline bool have_fw_feat(struct device_node *fw_features,
+				const char *state, const char *name)
+{
+	struct device_node *np;
+	bool r = false;
+
+	np = of_get_child_by_name(fw_features, name);
+	if (np) {
+		r = of_property_read_bool(np, state);
+		of_node_put(np);
+	}
+	return r;
+}
+
+static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
+{
+	struct device_node *np, *fw_features;
+	int r;
+
+	memset(cp, 0, sizeof(*cp));
+	r = pseries_get_cpu_char(cp);
+	if (r != -ENOTTY)
+		return r;
+
+	np = of_find_node_by_name(NULL, "ibm,opal");
+	if (np) {
+		fw_features = of_get_child_by_name(np, "fw-features");
+		of_node_put(np);
+		if (!fw_features)
+			return 0;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-spec-barrier-ori31,31,0"))
+			cp->character |= KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-bcctrl-serialized"))
+			cp->character |= KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-l1d-flush-ori30,30,0"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30;
+		if (have_fw_feat(fw_features, "enabled",
+				 "inst-l1d-flush-trig2"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-l1d-thread-split"))
+			cp->character |= KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-count-cache-disabled"))
+			cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+		if (have_fw_feat(fw_features, "enabled",
+				 "fw-count-cache-flush-bcctr2,0,0"))
+			cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
+		cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
+			KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
+			KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
+			KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
+			KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
+			KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
+
+		if (have_fw_feat(fw_features, "enabled",
+				 "speculation-policy-favor-security"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY;
+		if (!have_fw_feat(fw_features, "disabled",
+				  "needs-l1d-flush-msr-pr-0-to-1"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR;
+		if (!have_fw_feat(fw_features, "disabled",
+				  "needs-spec-barrier-for-bound-checks"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+		if (have_fw_feat(fw_features, "enabled",
+				 "needs-count-cache-flush-on-context-switch"))
+			cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
+		cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
+			KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
+			KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
+			KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
+
+		of_node_put(fw_features);
+	}
+
+	return 0;
+}
+#endif
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm __maybe_unused = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	switch (ioctl) {
+	case KVM_PPC_GET_PVINFO: {
+		struct kvm_ppc_pvinfo pvinfo;
+		memset(&pvinfo, 0, sizeof(pvinfo));
+		r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
+		if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
+			r = -EFAULT;
+			goto out;
+		}
+
+		break;
+	}
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case KVM_CREATE_SPAPR_TCE_64: {
+		struct kvm_create_spapr_tce_64 create_tce_64;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+			goto out;
+		if (create_tce_64.flags) {
+			r = -EINVAL;
+			goto out;
+		}
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+		goto out;
+	}
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm_create_spapr_tce_64 create_tce_64;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+
+		create_tce_64.liobn = create_tce.liobn;
+		create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.offset = 0;
+		create_tce_64.size = create_tce.window_size >>
+				IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.flags = 0;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+		goto out;
+	}
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_PPC_GET_SMMU_INFO: {
+		struct kvm_ppc_smmu_info info;
+		struct kvm *kvm = filp->private_data;
+
+		memset(&info, 0, sizeof(info));
+		r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
+	case KVM_PPC_RTAS_DEFINE_TOKEN: {
+		struct kvm *kvm = filp->private_data;
+
+		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+		break;
+	}
+	case KVM_PPC_CONFIGURE_V3_MMU: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_mmuv3_cfg cfg;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->configure_mmu)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(&cfg, argp, sizeof(cfg)))
+			goto out;
+		r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg);
+		break;
+	}
+	case KVM_PPC_GET_RMMU_INFO: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_rmmu_info info;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->get_rmmu_info)
+			goto out;
+		r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
+	case KVM_PPC_GET_CPU_CHAR: {
+		struct kvm_ppc_cpu_char cpuchar;
+
+		r = kvmppc_get_cpu_char(&cpuchar);
+		if (r >= 0 && copy_to_user(argp, &cpuchar, sizeof(cpuchar)))
+			r = -EFAULT;
+		break;
+	}
+	case KVM_PPC_SVM_OFF: {
+		struct kvm *kvm = filp->private_data;
+
+		r = 0;
+		if (!kvm->arch.kvm_ops->svm_off)
+			goto out;
+
+		r = kvm->arch.kvm_ops->svm_off(kvm);
+		break;
+	}
+	default: {
+		struct kvm *kvm = filp->private_data;
+		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
+	}
+#else /* CONFIG_PPC_BOOK3S_64 */
+	default:
+		r = -ENOTTY;
+#endif
+	}
+out:
+	return r;
+}
+
+static DEFINE_IDA(lpid_inuse);
+static unsigned long nr_lpids;
+
+long kvmppc_alloc_lpid(void)
+{
+	int lpid;
+
+	/* The host LPID must always be 0 (allocation starts at 1) */
+	lpid = ida_alloc_range(&lpid_inuse, 1, nr_lpids - 1, GFP_KERNEL);
+	if (lpid < 0) {
+		if (lpid == -ENOMEM)
+			pr_err("%s: Out of memory\n", __func__);
+		else
+			pr_err("%s: No LPIDs free\n", __func__);
+		return -ENOMEM;
+	}
+
+	return lpid;
+}
+EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid);
+
+void kvmppc_free_lpid(long lpid)
+{
+	ida_free(&lpid_inuse, lpid);
+}
+EXPORT_SYMBOL_GPL(kvmppc_free_lpid);
+
+/* nr_lpids_param includes the host LPID */
+void kvmppc_init_lpid(unsigned long nr_lpids_param)
+{
+	nr_lpids = nr_lpids_param;
+}
+EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr);
+
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+	if (vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs)
+		vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs(vcpu, debugfs_dentry);
+}
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+	if (kvm->arch.kvm_ops->create_vm_debugfs)
+		kvm->arch.kvm_ops->create_vm_debugfs(kvm);
+	return 0;
+}
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
new file mode 100644
index 0000000000..25071331f8
--- /dev/null
+++ b/arch/powerpc/kvm/timing.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+
+#include <asm/time.h>
+#include <asm-generic/div64.h>
+
+#include "timing.h"
+
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	/* Take a lock to avoid concurrent updates */
+	mutex_lock(&vcpu->arch.exit_timing_lock);
+
+	vcpu->arch.last_exit_type = 0xDEAD;
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+		vcpu->arch.timing_count_type[i] = 0;
+		vcpu->arch.timing_max_duration[i] = 0;
+		vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF;
+		vcpu->arch.timing_sum_duration[i] = 0;
+		vcpu->arch.timing_sum_quad_duration[i] = 0;
+	}
+	vcpu->arch.timing_last_exit = 0;
+	vcpu->arch.timing_exit.tv64 = 0;
+	vcpu->arch.timing_last_enter.tv64 = 0;
+
+	mutex_unlock(&vcpu->arch.exit_timing_lock);
+}
+
+static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
+{
+	u64 old;
+
+	mutex_lock(&vcpu->arch.exit_timing_lock);
+
+	vcpu->arch.timing_count_type[type]++;
+
+	/* sum */
+	old = vcpu->arch.timing_sum_duration[type];
+	vcpu->arch.timing_sum_duration[type] += duration;
+	if (unlikely(old > vcpu->arch.timing_sum_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old, vcpu->arch.timing_sum_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* square sum */
+	old = vcpu->arch.timing_sum_quad_duration[type];
+	vcpu->arch.timing_sum_quad_duration[type] += (duration*duration);
+	if (unlikely(old > vcpu->arch.timing_sum_quad_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of squared durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old,
+			vcpu->arch.timing_sum_quad_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* set min/max */
+	if (unlikely(duration < vcpu->arch.timing_min_duration[type]))
+		vcpu->arch.timing_min_duration[type] = duration;
+	if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
+		vcpu->arch.timing_max_duration[type] = duration;
+
+	mutex_unlock(&vcpu->arch.exit_timing_lock);
+}
+
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
+{
+	u64 exit = vcpu->arch.timing_last_exit;
+	u64 enter = vcpu->arch.timing_last_enter.tv64;
+
+	/* save exit time, used next exit when the reenter time is known */
+	vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64;
+
+	if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0))
+		return; /* skip incomplete cycle (e.g. after reset) */
+
+	/* update statistics for average and standard deviation */
+	add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type);
+	/* enter -> timing_last_exit is time spent in guest - log this too */
+	add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter),
+			TIMEINGUEST);
+}
+
+static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
+	[MMIO_EXITS] =              "MMIO",
+	[SIGNAL_EXITS] =            "SIGNAL",
+	[ITLB_REAL_MISS_EXITS] =    "ITLBREAL",
+	[ITLB_VIRT_MISS_EXITS] =    "ITLBVIRT",
+	[DTLB_REAL_MISS_EXITS] =    "DTLBREAL",
+	[DTLB_VIRT_MISS_EXITS] =    "DTLBVIRT",
+	[SYSCALL_EXITS] =           "SYSCALL",
+	[ISI_EXITS] =               "ISI",
+	[DSI_EXITS] =               "DSI",
+	[EMULATED_INST_EXITS] =     "EMULINST",
+	[EMULATED_MTMSRWE_EXITS] =  "EMUL_WAIT",
+	[EMULATED_WRTEE_EXITS] =    "EMUL_WRTEE",
+	[EMULATED_MTSPR_EXITS] =    "EMUL_MTSPR",
+	[EMULATED_MFSPR_EXITS] =    "EMUL_MFSPR",
+	[EMULATED_MTMSR_EXITS] =    "EMUL_MTMSR",
+	[EMULATED_MFMSR_EXITS] =    "EMUL_MFMSR",
+	[EMULATED_TLBSX_EXITS] =    "EMUL_TLBSX",
+	[EMULATED_TLBWE_EXITS] =    "EMUL_TLBWE",
+	[EMULATED_RFI_EXITS] =      "EMUL_RFI",
+	[DEC_EXITS] =               "DEC",
+	[EXT_INTR_EXITS] =          "EXTINT",
+	[HALT_WAKEUP] =             "HALT",
+	[USR_PR_INST] =             "USR_PR_INST",
+	[FP_UNAVAIL] =              "FP_UNAVAIL",
+	[DEBUG_EXITS] =             "DEBUG",
+	[TIMEINGUEST] =             "TIMEINGUEST"
+};
+
+static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
+{
+	struct kvm_vcpu *vcpu = m->private;
+	int i;
+	u64 min, max, sum, sum_quad;
+
+	seq_puts(m, "type	count	min	max	sum	sum_squared\n");
+
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+
+		min = vcpu->arch.timing_min_duration[i];
+		do_div(min, tb_ticks_per_usec);
+		max = vcpu->arch.timing_max_duration[i];
+		do_div(max, tb_ticks_per_usec);
+		sum = vcpu->arch.timing_sum_duration[i];
+		do_div(sum, tb_ticks_per_usec);
+		sum_quad = vcpu->arch.timing_sum_quad_duration[i];
+		do_div(sum_quad, tb_ticks_per_usec);
+
+		seq_printf(m, "%12s	%10d	%10lld	%10lld	%20lld	%20lld\n",
+			kvm_exit_names[i],
+			vcpu->arch.timing_count_type[i],
+			min,
+			max,
+			sum,
+			sum_quad);
+
+	}
+	return 0;
+}
+
+/* Write 'c' to clear the timing statistics. */
+static ssize_t kvmppc_exit_timing_write(struct file *file,
+				       const char __user *user_buf,
+				       size_t count, loff_t *ppos)
+{
+	int err = -EINVAL;
+	char c;
+
+	if (count > 1) {
+		goto done;
+	}
+
+	if (get_user(c, user_buf)) {
+		err = -EFAULT;
+		goto done;
+	}
+
+	if (c == 'c') {
+		struct seq_file *seqf = file->private_data;
+		struct kvm_vcpu *vcpu = seqf->private;
+		/* Write does not affect our buffers previously generated with
+		 * show. seq_file is locked here to prevent races of init with
+		 * a show call */
+		mutex_lock(&seqf->lock);
+		kvmppc_init_timing_stats(vcpu);
+		mutex_unlock(&seqf->lock);
+		err = count;
+	}
+
+done:
+	return err;
+}
+
+static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmppc_exit_timing_show, inode->i_private);
+}
+
+static const struct file_operations kvmppc_exit_timing_fops = {
+	.owner   = THIS_MODULE,
+	.open    = kvmppc_exit_timing_open,
+	.read    = seq_read,
+	.write   = kvmppc_exit_timing_write,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu,
+				    struct dentry *debugfs_dentry)
+{
+	debugfs_create_file("timing", 0666, debugfs_dentry,
+			    vcpu, &kvmppc_exit_timing_fops);
+	return 0;
+}
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
new file mode 100644
index 0000000000..45817ab82b
--- /dev/null
+++ b/arch/powerpc/kvm/timing.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#ifndef __POWERPC_KVM_EXITTIMING_H__
+#define __POWERPC_KVM_EXITTIMING_H__
+
+#include <linux/kvm_host.h>
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
+int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu,
+				    struct dentry *debugfs_dentry);
+
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type)
+{
+	vcpu->arch.last_exit_type = type;
+}
+
+#else
+/* if exit timing is not configured there is no need to build the c file */
+static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline int kvmppc_create_vcpu_debugfs_e500(struct kvm_vcpu *vcpu,
+						  struct dentry *debugfs_dentry)
+{
+	return 0;
+}
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
+#endif /* CONFIG_KVM_EXIT_TIMING */
+
+/* account the exit in kvm_stats */
+static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
+{
+	/* type has to be known at build time for optimization */
+
+	/* The BUILD_BUG_ON below breaks in funny ways, commented out
+	 * for now ... -BenH
+	BUILD_BUG_ON(!__builtin_constant_p(type));
+	*/
+	switch (type) {
+	case EXT_INTR_EXITS:
+		vcpu->stat.ext_intr_exits++;
+		break;
+	case DEC_EXITS:
+		vcpu->stat.dec_exits++;
+		break;
+	case EMULATED_INST_EXITS:
+		vcpu->stat.emulated_inst_exits++;
+		break;
+	case DSI_EXITS:
+		vcpu->stat.dsi_exits++;
+		break;
+	case ISI_EXITS:
+		vcpu->stat.isi_exits++;
+		break;
+	case SYSCALL_EXITS:
+		vcpu->stat.syscall_exits++;
+		break;
+	case DTLB_REAL_MISS_EXITS:
+		vcpu->stat.dtlb_real_miss_exits++;
+		break;
+	case DTLB_VIRT_MISS_EXITS:
+		vcpu->stat.dtlb_virt_miss_exits++;
+		break;
+	case MMIO_EXITS:
+		vcpu->stat.mmio_exits++;
+		break;
+	case ITLB_REAL_MISS_EXITS:
+		vcpu->stat.itlb_real_miss_exits++;
+		break;
+	case ITLB_VIRT_MISS_EXITS:
+		vcpu->stat.itlb_virt_miss_exits++;
+		break;
+	case SIGNAL_EXITS:
+		vcpu->stat.signal_exits++;
+		break;
+	case DBELL_EXITS:
+		vcpu->stat.dbell_exits++;
+		break;
+	case GDBELL_EXITS:
+		vcpu->stat.gdbell_exits++;
+		break;
+	}
+}
+
+/* wrapper to set exit time and account for it in kvm_stats */
+static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type)
+{
+	kvmppc_set_exit_type(vcpu, type);
+	kvmppc_account_exit_stat(vcpu, type);
+}
+
+#endif /* __POWERPC_KVM_EXITTIMING_H__ */
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
new file mode 100644
index 0000000000..b506c4d9a8
--- /dev/null
+++ b/arch/powerpc/kvm/tm.S
@@ -0,0 +1,398 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * Derived from book3s_hv_rmhandlers.S, which is:
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/tm.h>
+#include <asm/cputable.h>
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
+
+/*
+ * Save transactional state and TM-related registers.
+ * Called with:
+ * - r3 pointing to the vcpu struct
+ * - r4 containing the MSR with current TS bits:
+ * 	(For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
+ * - r5 containing a flag indicating that non-volatile registers
+ *	must be preserved.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1, r2 before exit.  If r5 != 0, this restores the
+ * MSR TM/FP/VEC/VSX bits to their state on entry.
+ */
+_GLOBAL(__kvmppc_save_tm)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+	mr	r9, r3
+	cmpdi	cr7, r5, 0
+
+	/* Turn on TM. */
+	mfmsr	r8
+	mr	r10, r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	ori     r8, r8, MSR_FP
+	oris    r8, r8, (MSR_VEC | MSR_VSX)@h
+	mtmsrd	r8
+
+	rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
+	beq	1f	/* TM not active in guest. */
+
+	std	r1, HSTATE_SCRATCH2(r13)
+	std	r3, HSTATE_SCRATCH1(r13)
+
+	/* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
+	mfcr	r6
+	SAVE_GPR(6, r1)
+
+	/* Save DSCR so we can restore it to avoid running with user value */
+	mfspr	r7, SPRN_DSCR
+	SAVE_GPR(7, r1)
+
+	/*
+	 * We are going to do treclaim., which will modify all checkpointed
+	 * registers.  Save the non-volatile registers on the stack if
+	 * preservation of non-volatile state has been requested.
+	 */
+	beq	cr7, 3f
+	SAVE_NVGPRS(r1)
+
+	/* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
+	li	r0, 0
+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	SAVE_GPR(10, r1)	/* final MSR value */
+3:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+BEGIN_FTR_SECTION
+	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
+	mfspr	r6, SPRN_TEXASR
+	std	r6, VCPU_ORIG_TEXASR(r3)
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
+#endif
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	li	r3, TM_CAUSE_KVM_RESCHED
+
+	/* All GPRs are volatile at this point. */
+	TRECLAIM(R3)
+
+	/* Temporarily store r13 and r9 so we have some regs to play with */
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9, PACATMSCRATCH(r13)
+	ld	r9, HSTATE_SCRATCH1(r13)
+
+	/* Save away PPR soon so we don't run with user value. */
+	std	r0, VCPU_GPRS_TM(0)(r9)
+	mfspr	r0, SPRN_PPR
+	HMT_MEDIUM
+
+	/* Reload stack pointer. */
+	std	r1, VCPU_GPRS_TM(1)(r9)
+	ld	r1, HSTATE_SCRATCH2(r13)
+
+	/* Set MSR RI now we have r1 and r13 back. */
+	std	r2, VCPU_GPRS_TM(2)(r9)
+	li	r2, MSR_RI
+	mtmsrd	r2, 1
+
+	/* Reload TOC pointer. */
+	LOAD_PACA_TOC()
+
+	/* Save all but r0-r2, r9 & r13 */
+	reg = 3
+	.rept	29
+	.if (reg != 9) && (reg != 13)
+	std	reg, VCPU_GPRS_TM(reg)(r9)
+	.endif
+	reg = reg + 1
+	.endr
+	/* ... now save r13 */
+	GET_SCRATCH0(r4)
+	std	r4, VCPU_GPRS_TM(13)(r9)
+	/* ... and save r9 */
+	ld	r4, PACATMSCRATCH(r13)
+	std	r4, VCPU_GPRS_TM(9)(r9)
+
+	/* Restore host DSCR and CR values, after saving guest values */
+	mfcr	r6
+	mfspr	r7, SPRN_DSCR
+	stw	r6, VCPU_CR_TM(r9)
+	std	r7, VCPU_DSCR_TM(r9)
+	REST_GPR(6, r1)
+	REST_GPR(7, r1)
+	mtcr	r6
+	mtspr	SPRN_DSCR, r7
+
+	/* Save away checkpointed SPRs. */
+	std	r0, VCPU_PPR_TM(r9)
+	mflr	r5
+	mfctr	r7
+	mfspr	r8, SPRN_AMR
+	mfspr	r10, SPRN_TAR
+	mfxer	r11
+	std	r5, VCPU_LR_TM(r9)
+	std	r7, VCPU_CTR_TM(r9)
+	std	r8, VCPU_AMR_TM(r9)
+	std	r10, VCPU_TAR_TM(r9)
+	std	r11, VCPU_XER_TM(r9)
+
+	/* Save FP/VSX. */
+	addi	r3, r9, VCPU_FPRS_TM
+	bl	store_fp_state
+	addi	r3, r9, VCPU_VRS_TM
+	bl	store_vr_state
+	mfspr	r6, SPRN_VRSAVE
+	stw	r6, VCPU_VRSAVE_TM(r9)
+
+	/* Restore non-volatile registers if requested to */
+	beq	cr7, 1f
+	REST_NVGPRS(r1)
+	REST_GPR(10, r1)
+1:
+	/*
+	 * We need to save these SPRs after the treclaim so that the software
+	 * error code is recorded correctly in the TEXASR.  Also the user may
+	 * change these outside of a transaction, so they must always be
+	 * context switched.
+	 */
+	mfspr	r7, SPRN_TEXASR
+	std	r7, VCPU_TEXASR(r9)
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+
+	/* Restore MSR state if requested */
+	beq	cr7, 2f
+	mtmsrd	r10, 0
+2:
+	addi	r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * _kvmppc_save_tm_pr() is a wrapper around __kvmppc_save_tm(), so that it can
+ * be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_save_tm_pr)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -PPC_MIN_STKFRM(r1)
+
+	mfspr   r8, SPRN_TAR
+	std	r8, PPC_MIN_STKFRM-8(r1)
+
+	li	r5, 1		/* preserve non-volatile registers */
+	bl	__kvmppc_save_tm
+
+	ld	r8, PPC_MIN_STKFRM-8(r1)
+	mtspr   SPRN_TAR, r8
+
+	addi    r1, r1, PPC_MIN_STKFRM
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with:
+ *  - r3 pointing to the vcpu struct.
+ *  - r4 is the guest MSR with desired TS bits:
+ * 	For HV KVM, it is VCPU_MSR
+ * 	For PR KVM, it is provided by caller
+ * - r5 containing a flag indicating that non-volatile registers
+ *	must be preserved.
+ * If r5 == 0, this potentially modifies all checkpointed registers, but
+ * restores r1, r2 from the PACA before exit.
+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
+ */
+_GLOBAL(__kvmppc_restore_tm)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+
+	cmpdi	cr7, r5, 0
+
+	/* Turn on TM/FP/VSX/VMX so we can restore them. */
+	mfmsr	r5
+	mr	r10, r5
+	li	r6, MSR_TM >> 32
+	sldi	r6, r6, 32
+	or	r5, r5, r6
+	ori	r5, r5, MSR_FP
+	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
+	mtmsrd	r5
+
+	/*
+	 * The user may change these outside of a transaction, so they must
+	 * always be context switched.
+	 */
+	ld	r5, VCPU_TFHAR(r3)
+	ld	r6, VCPU_TFIAR(r3)
+	ld	r7, VCPU_TEXASR(r3)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+
+	mr	r5, r4
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beq	9f		/* TM not active in guest */
+
+	/* Make sure the failure summary is set, otherwise we'll program check
+	 * when we trechkpt.  It's possible that this might have been not set
+	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+	 * host.
+	 */
+	oris	r7, r7, (TEXASR_FS)@h
+	mtspr	SPRN_TEXASR, r7
+
+	/*
+	 * Make a stack frame and save non-volatile registers if requested.
+	 */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+	std	r1, HSTATE_SCRATCH2(r13)
+
+	mfcr	r6
+	mfspr	r7, SPRN_DSCR
+	SAVE_GPR(2, r1)
+	SAVE_GPR(6, r1)
+	SAVE_GPR(7, r1)
+
+	beq	cr7, 4f
+	SAVE_NVGPRS(r1)
+
+	/* MSR[TS] will be 1 (suspended) once we do trechkpt */
+	li	r0, 1
+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	SAVE_GPR(10, r1)	/* final MSR value */
+4:
+	/*
+	 * We need to load up the checkpointed state for the guest.
+	 * We need to do this early as it will blow away any GPRs, VSRs and
+	 * some SPRs.
+	 */
+
+	mr	r31, r3
+	addi	r3, r31, VCPU_FPRS_TM
+	bl	load_fp_state
+	addi	r3, r31, VCPU_VRS_TM
+	bl	load_vr_state
+	mr	r3, r31
+	lwz	r7, VCPU_VRSAVE_TM(r3)
+	mtspr	SPRN_VRSAVE, r7
+
+	ld	r5, VCPU_LR_TM(r3)
+	lwz	r6, VCPU_CR_TM(r3)
+	ld	r7, VCPU_CTR_TM(r3)
+	ld	r8, VCPU_AMR_TM(r3)
+	ld	r9, VCPU_TAR_TM(r3)
+	ld	r10, VCPU_XER_TM(r3)
+	mtlr	r5
+	mtcr	r6
+	mtctr	r7
+	mtspr	SPRN_AMR, r8
+	mtspr	SPRN_TAR, r9
+	mtxer	r10
+
+	/*
+	 * Load up PPR and DSCR values but don't put them in the actual SPRs
+	 * till the last moment to avoid running with userspace PPR and DSCR for
+	 * too long.
+	 */
+	ld	r29, VCPU_DSCR_TM(r3)
+	ld	r30, VCPU_PPR_TM(r3)
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* Load GPRs r0-r28 */
+	reg = 0
+	.rept	29
+	ld	reg, VCPU_GPRS_TM(reg)(r31)
+	reg = reg + 1
+	.endr
+
+	mtspr	SPRN_DSCR, r29
+	mtspr	SPRN_PPR, r30
+
+	/* Load final GPRs */
+	ld	29, VCPU_GPRS_TM(29)(r31)
+	ld	30, VCPU_GPRS_TM(30)(r31)
+	ld	31, VCPU_GPRS_TM(31)(r31)
+
+	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
+	TRECHKPT
+
+	/* Now let's get back the state we need. */
+	HMT_MEDIUM
+	GET_PACA(r13)
+	ld	r1, HSTATE_SCRATCH2(r13)
+	REST_GPR(7, r1)
+	mtspr	SPRN_DSCR, r7
+
+	/* Set the MSR RI since we have our registers back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+
+	/* Restore TOC pointer and CR */
+	REST_GPR(2, r1)
+	REST_GPR(6, r1)
+	mtcr	r6
+
+	/* Restore non-volatile registers if requested to. */
+	beq	cr7, 5f
+	REST_GPR(10, r1)
+	REST_NVGPRS(r1)
+
+5:	addi	r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+
+9:	/* Restore MSR bits if requested */
+	beqlr	cr7
+	mtmsrd	r10, 0
+	blr
+
+/*
+ * _kvmppc_restore_tm_pr() is a wrapper around __kvmppc_restore_tm(), so that it
+ * can be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_restore_tm_pr)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu    r1, -PPC_MIN_STKFRM(r1)
+
+	/* save TAR so that it can be recovered later */
+	mfspr   r8, SPRN_TAR
+	std	r8, PPC_MIN_STKFRM-8(r1)
+
+	li	r5, 1
+	bl	__kvmppc_restore_tm
+
+	ld	r8, PPC_MIN_STKFRM-8(r1)
+	mtspr   SPRN_TAR, r8
+
+	addi    r1, r1, PPC_MIN_STKFRM
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
new file mode 100644
index 0000000000..ea1d7c8083
--- /dev/null
+++ b/arch/powerpc/kvm/trace.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_ppc_instr,
+	TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate),
+	TP_ARGS(inst, _pc, emulate),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	inst		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned int,	emulate		)
+	),
+
+	TP_fast_assign(
+		__entry->inst		= inst;
+		__entry->pc		= _pc;
+		__entry->emulate	= emulate;
+	),
+
+	TP_printk("inst %u pc 0x%lx emulate %u\n",
+		  __entry->inst, __entry->pc, __entry->emulate)
+);
+
+TRACE_EVENT(kvm_stlb_inval,
+	TP_PROTO(unsigned int stlb_index),
+	TP_ARGS(stlb_index),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	stlb_index	)
+	),
+
+	TP_fast_assign(
+		__entry->stlb_index	= stlb_index;
+	),
+
+	TP_printk("stlb_index %u", __entry->stlb_index)
+);
+
+TRACE_EVENT(kvm_stlb_write,
+	TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0,
+		 unsigned int word1, unsigned int word2),
+	TP_ARGS(victim, tid, word0, word1, word2),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	victim		)
+		__field(	unsigned int,	tid		)
+		__field(	unsigned int,	word0		)
+		__field(	unsigned int,	word1		)
+		__field(	unsigned int,	word2		)
+	),
+
+	TP_fast_assign(
+		__entry->victim		= victim;
+		__entry->tid		= tid;
+		__entry->word0		= word0;
+		__entry->word1		= word1;
+		__entry->word2		= word2;
+	),
+
+	TP_printk("victim %u tid %u w0 %u w1 %u w2 %u",
+		__entry->victim, __entry->tid, __entry->word0,
+		__entry->word1, __entry->word2)
+);
+
+TRACE_EVENT(kvm_gtlb_write,
+	TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0,
+		 unsigned int word1, unsigned int word2),
+	TP_ARGS(gtlb_index, tid, word0, word1, word2),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	gtlb_index	)
+		__field(	unsigned int,	tid		)
+		__field(	unsigned int,	word0		)
+		__field(	unsigned int,	word1		)
+		__field(	unsigned int,	word2		)
+	),
+
+	TP_fast_assign(
+		__entry->gtlb_index	= gtlb_index;
+		__entry->tid		= tid;
+		__entry->word0		= word0;
+		__entry->word1		= word1;
+		__entry->word2		= word2;
+	),
+
+	TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u",
+		__entry->gtlb_index, __entry->tid, __entry->word0,
+		__entry->word1, __entry->word2)
+);
+
+TRACE_EVENT(kvm_check_requests,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	requests	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->requests	= vcpu->requests;
+	),
+
+	TP_printk("vcpu=%x requests=%x",
+		__entry->cpu_nr, __entry->requests)
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
new file mode 100644
index 0000000000..372a82fa2d
--- /dev/null
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_BOOK3S_H)
+#define _TRACE_KVM_BOOK3S_H
+
+/*
+ * Common defines used by the trace macros in trace_pr.h and trace_hv.h
+ */
+
+#define kvm_trace_symbol_exit \
+	{0x100, "SYSTEM_RESET"}, \
+	{0x200, "MACHINE_CHECK"}, \
+	{0x300, "DATA_STORAGE"}, \
+	{0x380, "DATA_SEGMENT"}, \
+	{0x400, "INST_STORAGE"}, \
+	{0x480, "INST_SEGMENT"}, \
+	{0x500, "EXTERNAL"}, \
+	{0x502, "EXTERNAL_HV"}, \
+	{0x600, "ALIGNMENT"}, \
+	{0x700, "PROGRAM"}, \
+	{0x800, "FP_UNAVAIL"}, \
+	{0x900, "DECREMENTER"}, \
+	{0x980, "HV_DECREMENTER"}, \
+	{0xc00, "SYSCALL"}, \
+	{0xd00, "TRACE"}, \
+	{0xe00, "H_DATA_STORAGE"}, \
+	{0xe20, "H_INST_STORAGE"}, \
+	{0xe40, "H_EMUL_ASSIST"}, \
+	{0xf00, "PERFMON"}, \
+	{0xf20, "ALTIVEC"}, \
+	{0xf40, "VSX"}
+
+#endif
diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
new file mode 100644
index 0000000000..eff6e82dbc
--- /dev/null
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_BOOKE_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_booke
+
+#define kvm_trace_symbol_exit \
+	{0, "CRITICAL"}, \
+	{1, "MACHINE_CHECK"}, \
+	{2, "DATA_STORAGE"}, \
+	{3, "INST_STORAGE"}, \
+	{4, "EXTERNAL"}, \
+	{5, "ALIGNMENT"}, \
+	{6, "PROGRAM"}, \
+	{7, "FP_UNAVAIL"}, \
+	{8, "SYSCALL"}, \
+	{9, "AP_UNAVAIL"}, \
+	{10, "DECREMENTER"}, \
+	{11, "FIT"}, \
+	{12, "WATCHDOG"}, \
+	{13, "DTLB_MISS"}, \
+	{14, "ITLB_MISS"}, \
+	{15, "DEBUG"}, \
+	{32, "SPE_UNAVAIL"}, \
+	{33, "SPE_FP_DATA"}, \
+	{34, "SPE_FP_ROUND"}, \
+	{35, "PERFORMANCE_MONITOR"}, \
+	{36, "DOORBELL"}, \
+	{37, "DOORBELL_CRITICAL"}, \
+	{38, "GUEST_DBELL"}, \
+	{39, "GUEST_DBELL_CRIT"}, \
+	{40, "HV_SYSCALL"}, \
+	{41, "HV_PRIV"}
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	last_inst	)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= vcpu->arch.shared->msr;
+		__entry->last_inst	= vcpu->arch.last_inst;
+	),
+
+	TP_printk("exit=%s"
+		" | pc=0x%lx"
+		" | msr=0x%lx"
+		" | dar=0x%lx"
+		" | last_inst=0x%lx"
+		,
+		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+		__entry->pc,
+		__entry->msr,
+		__entry->dar,
+		__entry->last_inst
+		)
+);
+
+TRACE_EVENT(kvm_booke206_stlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas8		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas8		= mas8;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas8, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_gtlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_ref_release,
+	TP_PROTO(__u64 pfn, __u32 flags),
+	TP_ARGS(pfn, flags),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	pfn		)
+		__field(	__u32,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= pfn;
+		__entry->flags		= flags;
+	),
+
+	TP_printk("pfn=%llx flags=%x",
+		__entry->pfn, __entry->flags)
+);
+
+#ifdef CONFIG_SPE_POSSIBLE
+#define kvm_trace_symbol_irqprio_spe \
+	{BOOKE_IRQPRIO_SPE_UNAVAIL, "SPE_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SPE_FP_DATA, "SPE_FP_DATA"}, \
+	{BOOKE_IRQPRIO_SPE_FP_ROUND, "SPE_FP_ROUND"},
+#else
+#define kvm_trace_symbol_irqprio_spe
+#endif
+
+#ifdef CONFIG_PPC_E500MC
+#define kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_ALTIVEC_UNAVAIL, "ALTIVEC_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_ALTIVEC_ASSIST, "ALTIVEC_ASSIST"},
+#else
+#define kvm_trace_symbol_irqprio_e500mc
+#endif
+
+#define kvm_trace_symbol_irqprio \
+	kvm_trace_symbol_irqprio_spe \
+	kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_DATA_STORAGE, "DATA_STORAGE"}, \
+	{BOOKE_IRQPRIO_INST_STORAGE, "INST_STORAGE"}, \
+	{BOOKE_IRQPRIO_ALIGNMENT, "ALIGNMENT"}, \
+	{BOOKE_IRQPRIO_PROGRAM, "PROGRAM"}, \
+	{BOOKE_IRQPRIO_FP_UNAVAIL, "FP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SYSCALL, "SYSCALL"}, \
+	{BOOKE_IRQPRIO_AP_UNAVAIL, "AP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_DTLB_MISS, "DTLB_MISS"}, \
+	{BOOKE_IRQPRIO_ITLB_MISS, "ITLB_MISS"}, \
+	{BOOKE_IRQPRIO_MACHINE_CHECK, "MACHINE_CHECK"}, \
+	{BOOKE_IRQPRIO_DEBUG, "DEBUG"}, \
+	{BOOKE_IRQPRIO_CRITICAL, "CRITICAL"}, \
+	{BOOKE_IRQPRIO_WATCHDOG, "WATCHDOG"}, \
+	{BOOKE_IRQPRIO_EXTERNAL, "EXTERNAL"}, \
+	{BOOKE_IRQPRIO_FIT, "FIT"}, \
+	{BOOKE_IRQPRIO_DECREMENTER, "DECREMENTER"}, \
+	{BOOKE_IRQPRIO_PERFORMANCE_MONITOR, "PERFORMANCE_MONITOR"}, \
+	{BOOKE_IRQPRIO_EXTERNAL_LEVEL, "EXTERNAL_LEVEL"}, \
+	{BOOKE_IRQPRIO_DBELL, "DBELL"}, \
+	{BOOKE_IRQPRIO_DBELL_CRIT, "DBELL_CRIT"} \
+
+TRACE_EVENT(kvm_booke_queue_irqprio,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
+	TP_ARGS(vcpu, priority),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	priority		)
+		__field(	unsigned long,	pending		)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->priority	= priority;
+		__entry->pending	= vcpu->arch.pending_exceptions;
+	),
+
+	TP_printk("vcpu=%x prio=%s pending=%lx",
+		__entry->cpu_nr,
+		__print_symbolic(__entry->priority, kvm_trace_symbol_irqprio),
+		__entry->pending)
+);
+
+#endif
+
+/* This part must be outside protection */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_booke
+
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
new file mode 100644
index 0000000000..8d57c84285
--- /dev/null
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -0,0 +1,525 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_HV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_HV_H
+
+#include <linux/tracepoint.h>
+#include "trace_book3s.h"
+#include <asm/hvcall.h>
+#include <asm/kvm_asm.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_hv
+
+#define kvm_trace_symbol_hcall \
+	{H_REMOVE,			"H_REMOVE"}, \
+	{H_ENTER,			"H_ENTER"}, \
+	{H_READ,			"H_READ"}, \
+	{H_CLEAR_MOD,			"H_CLEAR_MOD"}, \
+	{H_CLEAR_REF,			"H_CLEAR_REF"}, \
+	{H_PROTECT,			"H_PROTECT"}, \
+	{H_GET_TCE,			"H_GET_TCE"}, \
+	{H_PUT_TCE,			"H_PUT_TCE"}, \
+	{H_SET_SPRG0,			"H_SET_SPRG0"}, \
+	{H_SET_DABR,			"H_SET_DABR"}, \
+	{H_PAGE_INIT,			"H_PAGE_INIT"}, \
+	{H_SET_ASR,			"H_SET_ASR"}, \
+	{H_ASR_ON,			"H_ASR_ON"}, \
+	{H_ASR_OFF,			"H_ASR_OFF"}, \
+	{H_LOGICAL_CI_LOAD,		"H_LOGICAL_CI_LOAD"}, \
+	{H_LOGICAL_CI_STORE,		"H_LOGICAL_CI_STORE"}, \
+	{H_LOGICAL_CACHE_LOAD,		"H_LOGICAL_CACHE_LOAD"}, \
+	{H_LOGICAL_CACHE_STORE,		"H_LOGICAL_CACHE_STORE"}, \
+	{H_LOGICAL_ICBI,		"H_LOGICAL_ICBI"}, \
+	{H_LOGICAL_DCBF,		"H_LOGICAL_DCBF"}, \
+	{H_GET_TERM_CHAR,		"H_GET_TERM_CHAR"}, \
+	{H_PUT_TERM_CHAR,		"H_PUT_TERM_CHAR"}, \
+	{H_REAL_TO_LOGICAL,		"H_REAL_TO_LOGICAL"}, \
+	{H_HYPERVISOR_DATA,		"H_HYPERVISOR_DATA"}, \
+	{H_EOI,				"H_EOI"}, \
+	{H_CPPR,			"H_CPPR"}, \
+	{H_IPI,				"H_IPI"}, \
+	{H_IPOLL,			"H_IPOLL"}, \
+	{H_XIRR,			"H_XIRR"}, \
+	{H_PERFMON,			"H_PERFMON"}, \
+	{H_MIGRATE_DMA,			"H_MIGRATE_DMA"}, \
+	{H_REGISTER_VPA,		"H_REGISTER_VPA"}, \
+	{H_CEDE,			"H_CEDE"}, \
+	{H_CONFER,			"H_CONFER"}, \
+	{H_PROD,			"H_PROD"}, \
+	{H_GET_PPP,			"H_GET_PPP"}, \
+	{H_SET_PPP,			"H_SET_PPP"}, \
+	{H_PURR,			"H_PURR"}, \
+	{H_PIC,				"H_PIC"}, \
+	{H_REG_CRQ,			"H_REG_CRQ"}, \
+	{H_FREE_CRQ,			"H_FREE_CRQ"}, \
+	{H_VIO_SIGNAL,			"H_VIO_SIGNAL"}, \
+	{H_SEND_CRQ,			"H_SEND_CRQ"}, \
+	{H_COPY_RDMA,			"H_COPY_RDMA"}, \
+	{H_REGISTER_LOGICAL_LAN,	"H_REGISTER_LOGICAL_LAN"}, \
+	{H_FREE_LOGICAL_LAN,		"H_FREE_LOGICAL_LAN"}, \
+	{H_ADD_LOGICAL_LAN_BUFFER,	"H_ADD_LOGICAL_LAN_BUFFER"}, \
+	{H_SEND_LOGICAL_LAN,		"H_SEND_LOGICAL_LAN"}, \
+	{H_BULK_REMOVE,			"H_BULK_REMOVE"}, \
+	{H_MULTICAST_CTRL,		"H_MULTICAST_CTRL"}, \
+	{H_SET_XDABR,			"H_SET_XDABR"}, \
+	{H_STUFF_TCE,			"H_STUFF_TCE"}, \
+	{H_PUT_TCE_INDIRECT,		"H_PUT_TCE_INDIRECT"}, \
+	{H_CHANGE_LOGICAL_LAN_MAC,	"H_CHANGE_LOGICAL_LAN_MAC"}, \
+	{H_VTERM_PARTNER_INFO,		"H_VTERM_PARTNER_INFO"}, \
+	{H_REGISTER_VTERM,		"H_REGISTER_VTERM"}, \
+	{H_FREE_VTERM,			"H_FREE_VTERM"}, \
+	{H_RESET_EVENTS,		"H_RESET_EVENTS"}, \
+	{H_ALLOC_RESOURCE,		"H_ALLOC_RESOURCE"}, \
+	{H_FREE_RESOURCE,		"H_FREE_RESOURCE"}, \
+	{H_MODIFY_QP,			"H_MODIFY_QP"}, \
+	{H_QUERY_QP,			"H_QUERY_QP"}, \
+	{H_REREGISTER_PMR,		"H_REREGISTER_PMR"}, \
+	{H_REGISTER_SMR,		"H_REGISTER_SMR"}, \
+	{H_QUERY_MR,			"H_QUERY_MR"}, \
+	{H_QUERY_MW,			"H_QUERY_MW"}, \
+	{H_QUERY_HCA,			"H_QUERY_HCA"}, \
+	{H_QUERY_PORT,			"H_QUERY_PORT"}, \
+	{H_MODIFY_PORT,			"H_MODIFY_PORT"}, \
+	{H_DEFINE_AQP1,			"H_DEFINE_AQP1"}, \
+	{H_GET_TRACE_BUFFER,		"H_GET_TRACE_BUFFER"}, \
+	{H_DEFINE_AQP0,			"H_DEFINE_AQP0"}, \
+	{H_RESIZE_MR,			"H_RESIZE_MR"}, \
+	{H_ATTACH_MCQP,			"H_ATTACH_MCQP"}, \
+	{H_DETACH_MCQP,			"H_DETACH_MCQP"}, \
+	{H_CREATE_RPT,			"H_CREATE_RPT"}, \
+	{H_REMOVE_RPT,			"H_REMOVE_RPT"}, \
+	{H_REGISTER_RPAGES,		"H_REGISTER_RPAGES"}, \
+	{H_DISABLE_AND_GET,		"H_DISABLE_AND_GET"}, \
+	{H_ERROR_DATA,			"H_ERROR_DATA"}, \
+	{H_GET_HCA_INFO,		"H_GET_HCA_INFO"}, \
+	{H_GET_PERF_COUNT,		"H_GET_PERF_COUNT"}, \
+	{H_MANAGE_TRACE,		"H_MANAGE_TRACE"}, \
+	{H_GET_CPU_CHARACTERISTICS,	"H_GET_CPU_CHARACTERISTICS"}, \
+	{H_FREE_LOGICAL_LAN_BUFFER,	"H_FREE_LOGICAL_LAN_BUFFER"}, \
+	{H_QUERY_INT_STATE,		"H_QUERY_INT_STATE"}, \
+	{H_POLL_PENDING,		"H_POLL_PENDING"}, \
+	{H_ILLAN_ATTRIBUTES,		"H_ILLAN_ATTRIBUTES"}, \
+	{H_MODIFY_HEA_QP,		"H_MODIFY_HEA_QP"}, \
+	{H_QUERY_HEA_QP,		"H_QUERY_HEA_QP"}, \
+	{H_QUERY_HEA,			"H_QUERY_HEA"}, \
+	{H_QUERY_HEA_PORT,		"H_QUERY_HEA_PORT"}, \
+	{H_MODIFY_HEA_PORT,		"H_MODIFY_HEA_PORT"}, \
+	{H_REG_BCMC,			"H_REG_BCMC"}, \
+	{H_DEREG_BCMC,			"H_DEREG_BCMC"}, \
+	{H_REGISTER_HEA_RPAGES,		"H_REGISTER_HEA_RPAGES"}, \
+	{H_DISABLE_AND_GET_HEA,		"H_DISABLE_AND_GET_HEA"}, \
+	{H_GET_HEA_INFO,		"H_GET_HEA_INFO"}, \
+	{H_ALLOC_HEA_RESOURCE,		"H_ALLOC_HEA_RESOURCE"}, \
+	{H_ADD_CONN,			"H_ADD_CONN"}, \
+	{H_DEL_CONN,			"H_DEL_CONN"}, \
+	{H_JOIN,			"H_JOIN"}, \
+	{H_VASI_STATE,			"H_VASI_STATE"}, \
+	{H_ENABLE_CRQ,			"H_ENABLE_CRQ"}, \
+	{H_GET_EM_PARMS,		"H_GET_EM_PARMS"}, \
+	{H_GET_ENERGY_SCALE_INFO,	"H_GET_ENERGY_SCALE_INFO"}, \
+	{H_SET_MPP,			"H_SET_MPP"}, \
+	{H_GET_MPP,			"H_GET_MPP"}, \
+	{H_HOME_NODE_ASSOCIATIVITY,	"H_HOME_NODE_ASSOCIATIVITY"}, \
+	{H_BEST_ENERGY,			"H_BEST_ENERGY"}, \
+	{H_XIRR_X,			"H_XIRR_X"}, \
+	{H_RANDOM,			"H_RANDOM"}, \
+	{H_COP,				"H_COP"}, \
+	{H_GET_MPP_X,			"H_GET_MPP_X"}, \
+	{H_SET_MODE,			"H_SET_MODE"}, \
+	{H_REGISTER_PROC_TBL,		"H_REGISTER_PROC_TBL"}, \
+	{H_QUERY_VAS_CAPABILITIES,	"H_QUERY_VAS_CAPABILITIES"}, \
+	{H_INT_GET_SOURCE_INFO,		"H_INT_GET_SOURCE_INFO"}, \
+	{H_INT_SET_SOURCE_CONFIG,	"H_INT_SET_SOURCE_CONFIG"}, \
+	{H_INT_GET_QUEUE_INFO,		"H_INT_GET_QUEUE_INFO"}, \
+	{H_INT_SET_QUEUE_CONFIG,	"H_INT_SET_QUEUE_CONFIG"}, \
+	{H_INT_ESB,			"H_INT_ESB"}, \
+	{H_INT_RESET,			"H_INT_RESET"}, \
+	{H_RPT_INVALIDATE,		"H_RPT_INVALIDATE"}, \
+	{H_RTAS,			"H_RTAS"}, \
+	{H_LOGICAL_MEMOP,		"H_LOGICAL_MEMOP"}, \
+	{H_CAS,				"H_CAS"}, \
+	{H_UPDATE_DT,			"H_UPDATE_DT"}, \
+	{H_GET_PERF_COUNTER_INFO,	"H_GET_PERF_COUNTER_INFO"}, \
+	{H_SET_PARTITION_TABLE,		"H_SET_PARTITION_TABLE"}, \
+	{H_ENTER_NESTED,		"H_ENTER_NESTED"}, \
+	{H_TLB_INVALIDATE,		"H_TLB_INVALIDATE"}, \
+	{H_COPY_TOFROM_GUEST,		"H_COPY_TOFROM_GUEST"}
+
+
+#define kvm_trace_symbol_kvmret \
+	{RESUME_GUEST,			"RESUME_GUEST"}, \
+	{RESUME_GUEST_NV,		"RESUME_GUEST_NV"}, \
+	{RESUME_HOST,			"RESUME_HOST"}, \
+	{RESUME_HOST_NV,		"RESUME_HOST_NV"}
+
+#define kvm_trace_symbol_hcall_rc \
+	{H_SUCCESS,			"H_SUCCESS"}, \
+	{H_BUSY,			"H_BUSY"}, \
+	{H_CLOSED,			"H_CLOSED"}, \
+	{H_NOT_AVAILABLE,		"H_NOT_AVAILABLE"}, \
+	{H_CONSTRAINED,			"H_CONSTRAINED"}, \
+	{H_PARTIAL,			"H_PARTIAL"}, \
+	{H_IN_PROGRESS,			"H_IN_PROGRESS"}, \
+	{H_PAGE_REGISTERED,		"H_PAGE_REGISTERED"}, \
+	{H_PARTIAL_STORE,		"H_PARTIAL_STORE"}, \
+	{H_PENDING,			"H_PENDING"}, \
+	{H_CONTINUE,			"H_CONTINUE"}, \
+	{H_LONG_BUSY_START_RANGE,	"H_LONG_BUSY_START_RANGE"}, \
+	{H_LONG_BUSY_ORDER_1_MSEC,	"H_LONG_BUSY_ORDER_1_MSEC"}, \
+	{H_LONG_BUSY_ORDER_10_MSEC,	"H_LONG_BUSY_ORDER_10_MSEC"}, \
+	{H_LONG_BUSY_ORDER_100_MSEC,	"H_LONG_BUSY_ORDER_100_MSEC"}, \
+	{H_LONG_BUSY_ORDER_1_SEC,	"H_LONG_BUSY_ORDER_1_SEC"}, \
+	{H_LONG_BUSY_ORDER_10_SEC,	"H_LONG_BUSY_ORDER_10_SEC"}, \
+	{H_LONG_BUSY_ORDER_100_SEC,	"H_LONG_BUSY_ORDER_100_SEC"}, \
+	{H_LONG_BUSY_END_RANGE,		"H_LONG_BUSY_END_RANGE"}, \
+	{H_TOO_HARD,			"H_TOO_HARD"}, \
+	{H_HARDWARE,			"H_HARDWARE"}, \
+	{H_FUNCTION,			"H_FUNCTION"}, \
+	{H_PRIVILEGE,			"H_PRIVILEGE"}, \
+	{H_PARAMETER,			"H_PARAMETER"}, \
+	{H_BAD_MODE,			"H_BAD_MODE"}, \
+	{H_PTEG_FULL,			"H_PTEG_FULL"}, \
+	{H_NOT_FOUND,			"H_NOT_FOUND"}, \
+	{H_RESERVED_DABR,		"H_RESERVED_DABR"}, \
+	{H_NO_MEM,			"H_NO_MEM"}, \
+	{H_AUTHORITY,			"H_AUTHORITY"}, \
+	{H_PERMISSION,			"H_PERMISSION"}, \
+	{H_DROPPED,			"H_DROPPED"}, \
+	{H_SOURCE_PARM,			"H_SOURCE_PARM"}, \
+	{H_DEST_PARM,			"H_DEST_PARM"}, \
+	{H_REMOTE_PARM,			"H_REMOTE_PARM"}, \
+	{H_RESOURCE,			"H_RESOURCE"}, \
+	{H_ADAPTER_PARM,		"H_ADAPTER_PARM"}, \
+	{H_RH_PARM,			"H_RH_PARM"}, \
+	{H_RCQ_PARM,			"H_RCQ_PARM"}, \
+	{H_SCQ_PARM,			"H_SCQ_PARM"}, \
+	{H_EQ_PARM,			"H_EQ_PARM"}, \
+	{H_RT_PARM,			"H_RT_PARM"}, \
+	{H_ST_PARM,			"H_ST_PARM"}, \
+	{H_SIGT_PARM,			"H_SIGT_PARM"}, \
+	{H_TOKEN_PARM,			"H_TOKEN_PARM"}, \
+	{H_MLENGTH_PARM,		"H_MLENGTH_PARM"}, \
+	{H_MEM_PARM,			"H_MEM_PARM"}, \
+	{H_MEM_ACCESS_PARM,		"H_MEM_ACCESS_PARM"}, \
+	{H_ATTR_PARM,			"H_ATTR_PARM"}, \
+	{H_PORT_PARM,			"H_PORT_PARM"}, \
+	{H_MCG_PARM,			"H_MCG_PARM"}, \
+	{H_VL_PARM,			"H_VL_PARM"}, \
+	{H_TSIZE_PARM,			"H_TSIZE_PARM"}, \
+	{H_TRACE_PARM,			"H_TRACE_PARM"}, \
+	{H_MASK_PARM,			"H_MASK_PARM"}, \
+	{H_MCG_FULL,			"H_MCG_FULL"}, \
+	{H_ALIAS_EXIST,			"H_ALIAS_EXIST"}, \
+	{H_P_COUNTER,			"H_P_COUNTER"}, \
+	{H_TABLE_FULL,			"H_TABLE_FULL"}, \
+	{H_ALT_TABLE,			"H_ALT_TABLE"}, \
+	{H_MR_CONDITION,		"H_MR_CONDITION"}, \
+	{H_NOT_ENOUGH_RESOURCES,	"H_NOT_ENOUGH_RESOURCES"}, \
+	{H_R_STATE,			"H_R_STATE"}, \
+	{H_RESCINDED,			"H_RESCINDED"}, \
+	{H_P2,				"H_P2"}, \
+	{H_P3,				"H_P3"}, \
+	{H_P4,				"H_P4"}, \
+	{H_P5,				"H_P5"}, \
+	{H_P6,				"H_P6"}, \
+	{H_P7,				"H_P7"}, \
+	{H_P8,				"H_P8"}, \
+	{H_P9,				"H_P9"}, \
+	{H_TOO_BIG,			"H_TOO_BIG"}, \
+	{H_OVERLAP,			"H_OVERLAP"}, \
+	{H_INTERRUPT,			"H_INTERRUPT"}, \
+	{H_BAD_DATA,			"H_BAD_DATA"}, \
+	{H_NOT_ACTIVE,			"H_NOT_ACTIVE"}, \
+	{H_SG_LIST,			"H_SG_LIST"}, \
+	{H_OP_MODE,			"H_OP_MODE"}, \
+	{H_COP_HW,			"H_COP_HW"}, \
+	{H_UNSUPPORTED_FLAG_START,	"H_UNSUPPORTED_FLAG_START"}, \
+	{H_UNSUPPORTED_FLAG_END,	"H_UNSUPPORTED_FLAG_END"}, \
+	{H_MULTI_THREADS_ACTIVE,	"H_MULTI_THREADS_ACTIVE"}, \
+	{H_OUTSTANDING_COP_OPS,		"H_OUTSTANDING_COP_OPS"}
+
+TRACE_EVENT(kvm_guest_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	pc)
+		__field(unsigned long,  pending_exceptions)
+		__field(u8,		ceded)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu->vcpu_id;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->ceded		= vcpu->arch.ceded;
+		__entry->pending_exceptions  = vcpu->arch.pending_exceptions;
+	),
+
+	TP_printk("VCPU %d: pc=0x%lx pexcp=0x%lx ceded=%d",
+			__entry->vcpu_id,
+			__entry->pc,
+			__entry->pending_exceptions, __entry->ceded)
+);
+
+TRACE_EVENT(kvm_guest_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(int,		trap)
+		__field(unsigned long,	pc)
+		__field(unsigned long,	msr)
+		__field(u8,		ceded)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->trap	 = vcpu->arch.trap;
+		__entry->ceded	 = vcpu->arch.ceded;
+		__entry->pc	 = kvmppc_get_pc(vcpu);
+		__entry->msr	 = vcpu->arch.shregs.msr;
+	),
+
+	TP_printk("VCPU %d: trap=%s pc=0x%lx msr=0x%lx, ceded=%d",
+		__entry->vcpu_id,
+		__print_symbolic(__entry->trap, kvm_trace_symbol_exit),
+		__entry->pc, __entry->msr, __entry->ceded
+	)
+);
+
+TRACE_EVENT(kvm_page_fault_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep,
+		 struct kvm_memory_slot *memslot, unsigned long ea,
+		 unsigned long dsisr),
+
+	TP_ARGS(vcpu, hptep, memslot, ea, dsisr),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	hpte_v)
+		__field(unsigned long,	hpte_r)
+		__field(unsigned long,	gpte_r)
+		__field(unsigned long,	ea)
+		__field(u64,		base_gfn)
+		__field(u32,		slot_flags)
+		__field(u32,		dsisr)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->hpte_v	  = hptep[0];
+		__entry->hpte_r	  = hptep[1];
+		__entry->gpte_r	  = hptep[2];
+		__entry->ea	  = ea;
+		__entry->dsisr	  = dsisr;
+		__entry->base_gfn = memslot ? memslot->base_gfn : -1UL;
+		__entry->slot_flags = memslot ? memslot->flags : 0;
+	),
+
+	TP_printk("VCPU %d: hpte=0x%lx:0x%lx guest=0x%lx ea=0x%lx,%x slot=0x%llx,0x%x",
+		   __entry->vcpu_id,
+		   __entry->hpte_v, __entry->hpte_r, __entry->gpte_r,
+		   __entry->ea, __entry->dsisr,
+		   __entry->base_gfn, __entry->slot_flags)
+);
+
+TRACE_EVENT(kvm_page_fault_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep, long ret),
+
+	TP_ARGS(vcpu, hptep, ret),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	hpte_v)
+		__field(unsigned long,	hpte_r)
+		__field(long,		ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->hpte_v	= hptep[0];
+		__entry->hpte_r	= hptep[1];
+		__entry->ret = ret;
+	),
+
+	TP_printk("VCPU %d: hpte=0x%lx:0x%lx ret=0x%lx",
+		   __entry->vcpu_id,
+		   __entry->hpte_v, __entry->hpte_r, __entry->ret)
+);
+
+TRACE_EVENT(kvm_hcall_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	req)
+		__field(unsigned long,	gpr4)
+		__field(unsigned long,	gpr5)
+		__field(unsigned long,	gpr6)
+		__field(unsigned long,	gpr7)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->req   = kvmppc_get_gpr(vcpu, 3);
+		__entry->gpr4  = kvmppc_get_gpr(vcpu, 4);
+		__entry->gpr5  = kvmppc_get_gpr(vcpu, 5);
+		__entry->gpr6  = kvmppc_get_gpr(vcpu, 6);
+		__entry->gpr7  = kvmppc_get_gpr(vcpu, 7);
+	),
+
+	TP_printk("VCPU %d: hcall=%s GPR4-7=0x%lx,0x%lx,0x%lx,0x%lx",
+		   __entry->vcpu_id,
+		   __print_symbolic(__entry->req, kvm_trace_symbol_hcall),
+		   __entry->gpr4, __entry->gpr5, __entry->gpr6, __entry->gpr7)
+);
+
+TRACE_EVENT(kvm_hcall_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int ret),
+
+	TP_ARGS(vcpu, ret),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	ret)
+		__field(unsigned long,	hcall_rc)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->ret	  = ret;
+		__entry->hcall_rc = kvmppc_get_gpr(vcpu, 3);
+	),
+
+	TP_printk("VCPU %d: ret=%s hcall_rc=%s",
+		   __entry->vcpu_id,
+		   __print_symbolic(__entry->ret, kvm_trace_symbol_kvmret),
+		   __print_symbolic(__entry->ret & RESUME_FLAG_HOST ?
+					H_TOO_HARD : __entry->hcall_rc,
+					kvm_trace_symbol_hcall_rc))
+);
+
+TRACE_EVENT(kvmppc_run_core,
+	TP_PROTO(struct kvmppc_vcore *vc, int where),
+
+	TP_ARGS(vc, where),
+
+	TP_STRUCT__entry(
+		__field(int,	n_runnable)
+		__field(int,	runner_vcpu)
+		__field(int,	where)
+		__field(pid_t,	tgid)
+	),
+
+	TP_fast_assign(
+		__entry->runner_vcpu	= vc->runner->vcpu_id;
+		__entry->n_runnable	= vc->n_runnable;
+		__entry->where		= where;
+		__entry->tgid		= current->tgid;
+	),
+
+	TP_printk("%s runner_vcpu==%d runnable=%d tgid=%d",
+		    __entry->where ? "Exit" : "Enter",
+		    __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_vcore_blocked,
+	TP_PROTO(struct kvm_vcpu *vcpu, int where),
+
+	TP_ARGS(vcpu, where),
+
+	TP_STRUCT__entry(
+		__field(int,	n_runnable)
+		__field(int,	runner_vcpu)
+		__field(int,	where)
+		__field(pid_t,	tgid)
+	),
+
+	TP_fast_assign(
+		__entry->runner_vcpu = vcpu->vcpu_id;
+		__entry->n_runnable  = vcpu->arch.vcore->n_runnable;
+		__entry->where       = where;
+		__entry->tgid	     = current->tgid;
+	),
+
+	TP_printk("%s runner_vcpu=%d runnable=%d tgid=%d",
+		   __entry->where ? "Exit" : "Enter",
+		   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_vcore_wakeup,
+	TP_PROTO(int do_sleep, __u64 ns),
+
+	TP_ARGS(do_sleep, ns),
+
+	TP_STRUCT__entry(
+		__field(__u64,  ns)
+		__field(int,    waited)
+		__field(pid_t,  tgid)
+	),
+
+	TP_fast_assign(
+		__entry->ns     = ns;
+		__entry->waited = do_sleep;
+		__entry->tgid   = current->tgid;
+	),
+
+	TP_printk("%s time %llu ns, tgid=%d",
+		__entry->waited ? "wait" : "poll",
+		__entry->ns, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_run_vcpu_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(pid_t,		tgid)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->tgid	  = current->tgid;
+	),
+
+	TP_printk("VCPU %d: tgid=%d", __entry->vcpu_id, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_run_vcpu_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(int,		exit)
+		__field(int,		ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->exit     = vcpu->run->exit_reason;
+		__entry->ret      = vcpu->arch.ret;
+	),
+
+	TP_printk("VCPU %d: exit=%d, ret=%d",
+			__entry->vcpu_id, __entry->exit, __entry->ret)
+);
+
+#endif /* _TRACE_KVM_HV_H */
+
+/* This part must be outside protection */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_hv
+
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
new file mode 100644
index 0000000000..46a46d328f
--- /dev/null
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_PR_H
+
+#include <linux/tracepoint.h>
+#include "trace_book3s.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_pr
+
+TRACE_EVENT(kvm_book3s_reenter,
+	TP_PROTO(int r, struct kvm_vcpu *vcpu),
+	TP_ARGS(r, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	r		)
+		__field(	unsigned long,	pc		)
+	),
+
+	TP_fast_assign(
+		__entry->r		= r;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+	),
+
+	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
+);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+TRACE_EVENT(kvm_book3s_64_mmu_map,
+	TP_PROTO(int rflags, ulong hpteg, ulong va, kvm_pfn_t hpaddr,
+		 struct kvmppc_pte *orig_pte),
+	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
+
+	TP_STRUCT__entry(
+		__field(	unsigned char,		flag_w		)
+		__field(	unsigned char,		flag_x		)
+		__field(	unsigned long,		eaddr		)
+		__field(	unsigned long,		hpteg		)
+		__field(	unsigned long,		va		)
+		__field(	unsigned long long,	vpage		)
+		__field(	unsigned long,		hpaddr		)
+	),
+
+	TP_fast_assign(
+		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
+		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x';
+		__entry->eaddr	= orig_pte->eaddr;
+		__entry->hpteg	= hpteg;
+		__entry->va	= va;
+		__entry->vpage	= orig_pte->vpage;
+		__entry->hpaddr	= hpaddr;
+	),
+
+	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
+		  __entry->flag_w, __entry->flag_x, __entry->eaddr,
+		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
+);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+TRACE_EVENT(kvm_book3s_mmu_map,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_vpn	)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_vpn	= pte->host_vpn;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_vpn, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_invalidate,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_vpn	)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_vpn	= pte->host_vpn;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_vpn, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_flush,
+	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
+		 unsigned long long p2),
+	TP_ARGS(type, vcpu, p1, p2),
+
+	TP_STRUCT__entry(
+		__field(	int,			count		)
+		__field(	unsigned long long,	p1		)
+		__field(	unsigned long long,	p2		)
+		__field(	const char *,		type		)
+	),
+
+	TP_fast_assign(
+		__entry->count		= to_book3s(vcpu)->hpte_cache_count;
+		__entry->p1		= p1;
+		__entry->p2		= p2;
+		__entry->type		= type;
+	),
+
+	TP_printk("Flush %d %sPTEs: %llx - %llx",
+		  __entry->count, __entry->type, __entry->p1, __entry->p2)
+);
+
+TRACE_EVENT(kvm_book3s_slb_found,
+	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
+	TP_ARGS(gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	gvsid		)
+		__field(	unsigned long long,	hvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->gvsid		= gvsid;
+		__entry->hvsid		= hvsid;
+	),
+
+	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_fail,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
+	TP_ARGS(sid_map_mask, gvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	gvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->gvsid		= gvsid;
+	),
+
+	TP_printk("%x/%x: %llx", __entry->sid_map_mask,
+		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_map,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
+		 unsigned long long hvsid),
+	TP_ARGS(sid_map_mask, gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	guest_vsid	)
+		__field(	unsigned long long,	host_vsid	)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->guest_vsid	= gvsid;
+		__entry->host_vsid	= hvsid;
+	),
+
+	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
+		  __entry->guest_vsid, __entry->host_vsid)
+);
+
+TRACE_EVENT(kvm_book3s_slbmte,
+	TP_PROTO(u64 slb_vsid, u64 slb_esid),
+	TP_ARGS(slb_vsid, slb_esid),
+
+	TP_STRUCT__entry(
+		__field(	u64,	slb_vsid	)
+		__field(	u64,	slb_esid	)
+	),
+
+	TP_fast_assign(
+		__entry->slb_vsid	= slb_vsid;
+		__entry->slb_esid	= slb_esid;
+	),
+
+	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
+);
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	srr1		)
+		__field(	unsigned long,	last_inst	)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= kvmppc_get_msr(vcpu);
+		__entry->srr1		= vcpu->arch.shadow_srr1;
+		__entry->last_inst	= vcpu->arch.last_inst;
+	),
+
+	TP_printk("exit=%s"
+		" | pc=0x%lx"
+		" | msr=0x%lx"
+		" | dar=0x%lx"
+		" | srr1=0x%lx"
+		" | last_inst=0x%lx"
+		,
+		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+		__entry->pc,
+		__entry->msr,
+		__entry->dar,
+		__entry->srr1,
+		__entry->last_inst
+		)
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_pr
+
+#include <trace/define_trace.h>
-- 
cgit v1.2.3